Skip to content

Commit 1a3aed6

Browse files
committed
Refactor get_col_names()
Use parse() and move to utility header
1 parent dbe009f commit 1a3aed6

9 files changed

Lines changed: 56 additions & 70 deletions

File tree

include/internal/csv_reader.cpp

Lines changed: 4 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -5,48 +5,6 @@
55
#include "csv_reader.hpp"
66

77
namespace csv {
8-
namespace internals {
9-
CSV_INLINE std::string format_row(const std::vector<std::string>& row, csv::string_view delim) {
10-
/** Print a CSV row */
11-
std::stringstream ret;
12-
for (size_t i = 0; i < row.size(); i++) {
13-
ret << row[i];
14-
if (i + 1 < row.size()) ret << delim;
15-
else ret << '\n';
16-
}
17-
ret.flush();
18-
19-
return ret.str();
20-
}
21-
22-
/** Return the selected header row from a parsed head buffer. */
23-
CSV_INLINE std::vector<std::string> _get_col_names(csv::string_view head, CSVFormat format) {
24-
// Parse the CSV
25-
auto trim_chars = format.get_trim_chars();
26-
std::stringstream source(head.data());
27-
RowCollection rows;
28-
29-
StreamParser<std::stringstream> parser(source, format);
30-
parser.set_output(rows);
31-
parser.next();
32-
33-
return CSVRow(std::move(rows[format.get_header()]));
34-
}
35-
}
36-
37-
/** Return a CSV's column names. */
38-
CSV_INLINE std::vector<std::string> get_col_names(csv::string_view filename, CSVFormat format) {
39-
auto head = internals::get_csv_head(filename);
40-
41-
/** Guess delimiter and header row */
42-
if (format.guess_delim()) {
43-
auto guess_result = guess_format(filename, format.get_possible_delims());
44-
format.delimiter(guess_result.delim).header_row(guess_result.header_row);
45-
}
46-
47-
return internals::_get_col_names(head, format);
48-
}
49-
508
/** Reads an arbitrarily large CSV file using memory-mapped IO.
519
*
5210
* **Details:** Reads the first block of a CSV file synchronously to get information
@@ -104,11 +62,8 @@ namespace csv {
10462

10563
/** Return the CSV's column names as a vector of strings. */
10664
CSV_INLINE std::vector<std::string> CSVReader::get_col_names() const {
107-
if (this->col_names) {
108-
return this->col_names->get_col_names();
109-
}
110-
111-
return std::vector<std::string>();
65+
return (this->col_names) ? this->col_names->get_col_names() :
66+
std::vector<std::string>();
11267
}
11368

11469
/** Return the index of the column name if found or
@@ -263,9 +218,9 @@ namespace csv {
263218

264219
if (policy == VariableColumnPolicy::THROW) {
265220
if (errored_row.size() < this->n_cols)
266-
throw std::runtime_error("Line too short " + internals::format_row(errored_row));
221+
throw std::runtime_error("Line too short " + std::string(errored_row.raw_str()));
267222

268-
throw std::runtime_error("Line too long " + internals::format_row(errored_row));
223+
throw std::runtime_error("Line too long " + std::string(errored_row.raw_str()));
269224
}
270225

271226
continue;

include/internal/csv_reader.hpp

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,6 @@
2727

2828
/** The all encompassing namespace */
2929
namespace csv {
30-
/** Stuff that is generally not of interest to end-users */
31-
namespace internals {
32-
std::string format_row(const std::vector<std::string>& row, csv::string_view delim = ", ");
33-
34-
std::vector<std::string> _get_col_names( csv::string_view head, const CSVFormat format = CSVFormat::guess_csv());
35-
}
36-
37-
std::vector<std::string> get_col_names(
38-
csv::string_view filename,
39-
const CSVFormat format = CSVFormat::guess_csv());
40-
4130
#if CSV_ENABLE_THREADS
4231
inline void join_worker(std::thread& worker) {
4332
if (worker.joinable()) worker.join();
@@ -386,11 +375,16 @@ namespace csv {
386375
template<typename TStream,
387376
csv::enable_if_t<std::is_base_of<std::istream, TStream>::value, int> = 0>
388377
void init_from_stream(TStream& source, CSVFormat format) {
378+
// C4316: StreamParser may have over-aligned SIMD members; heap allocation
379+
// alignment is handled correctly at runtime via the allocator on supported
380+
// platforms. Suppress the MSVC false-positive here.
381+
CSV_MSVC_PUSH_DISABLE(4316)
389382
this->init_parser(
390383
std::unique_ptr<internals::IBasicCSVParser>(
391384
new internals::StreamParser<TStream>(source, format, this->col_names)
392385
)
393386
);
387+
CSV_MSVC_POP
394388
}
395389

396390
/** Read initial chunk to get metadata */

include/internal/csv_row.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,17 @@ namespace csv {
6363
return ret;
6464
}
6565

66+
/** Return a view of this row's raw bytes inside the parse buffer.
 *
 *  The view starts at `data_start` and stops before the first '\n' found at
 *  or after that offset; if there is none it runs to the end of the buffer.
 *
 *  @return An empty view when the row has no data block or `data_start` is
 *          at or past the end of the buffer.
 */
CSV_INLINE csv::string_view CSVRow::raw_str() const noexcept {
    csv::string_view row_text;  // stays empty unless a valid span is found

    if (data) {
        const csv::string_view buffer = data->data;

        if (data_start < buffer.size()) {
            const size_t newline_pos = buffer.find('\n', data_start);

            // Without a newline the row extends to the end of the buffer.
            size_t row_len = buffer.size() - data_start;
            if (newline_pos != csv::string_view::npos) {
                row_len = newline_pos - data_start;
            }

            row_text = buffer.substr(data_start, row_len);
        }
    }

    return row_text;
}
76+
6677
/** Build a map from column names to values for a given row. */
6778
CSV_INLINE std::unordered_map<std::string, std::string> CSVRow::to_unordered_map() const {
6879
std::unordered_map<std::string, std::string> row_map;

include/internal/csv_row.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,15 @@ namespace csv {
363363
* it materializes all fields as owning strings.
364364
*/
365365
operator std::vector<std::string>() const;
366+
367+
/** Return a string_view of the raw bytes of this row as they appear in
368+
* the underlying parse buffer, up to (but not including) the trailing
369+
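* newline character (that is, the first '\n' at or after the row's start offset, so a quoted field containing a line break ends the view early).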
370+
*
371+
* @warning The view is only valid for as long as the CSVRow (and its
372+
* associated data chunk) remains alive.
373+
*/
374+
csv::string_view raw_str() const noexcept;
366375
///@}
367376

368377
/** A random access iterator over the contents of a CSV row.

include/internal/csv_stat.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ namespace csv {
153153
}
154154
}
155155
else if (this->reader.get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) {
156-
throw std::runtime_error("Line has different length than the others " + internals::format_row(*current_record));
156+
throw std::runtime_error("Line has different length than the others " + std::string(current_record->raw_str()));
157157
}
158158

159159
++current_record;

include/internal/csv_utility.hpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ namespace csv {
3434
* @par Example
3535
* @snippet tests/test_read_csv.cpp Parse Example
3636
*/
37-
inline CSVReader parse(csv::string_view in, const CSVFormat& format = CSVFormat::guess_csv()) {
    // Copy the viewed bytes into a string stream so the caller's buffer does
    // not have to outlive the reader; ownership of the stream is handed to
    // the CSVReader constructor.
    std::unique_ptr<std::istream> owned_stream(
        new std::stringstream(std::string(in))
    );

    return CSVReader(std::move(owned_stream), format);
}
@@ -105,11 +105,20 @@ namespace csv {
105105
};
106106
}
107107

108+
/** Get the column names of a CSV file using just the first 500KB. */
109+
inline std::vector<std::string> get_col_names(
110+
csv::string_view filename,
111+
const CSVFormat& format = CSVFormat::guess_csv()) {
112+
auto head = internals::get_csv_head(filename);
113+
return parse_unsafe(head, format).get_col_names();
114+
}
115+
108116
/** Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise. */
109-
inline int get_col_pos(csv::string_view filename, csv::string_view col_name,
117+
inline long long get_col_pos(csv::string_view filename, csv::string_view col_name,
110118
const CSVFormat& format = CSVFormat::guess_csv()) {
111-
CSVReader reader(filename, format);
112-
return reader.index_of(col_name);
119+
auto col_names = get_col_names(filename, format);
120+
return col_names.empty() ? CSV_NOT_FOUND :
121+
std::distance(col_names.begin(), std::find(col_names.begin(), col_names.end(), col_name));
113122
}
114123
///@}
115124
}

programs/csv_info.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,12 @@ int main(int argc, char** argv) {
1212
std::string file = argv[1];
1313
auto info = get_file_info(file);
1414

15-
std::cout << file << std::endl
16-
<< "Columns: " << internals::format_row(info.col_names, ", ")
15+
std::cout << file << std::endl << "Columns: ";
16+
for (size_t i = 0; i < info.col_names.size(); i++) {
17+
if (i) std::cout << ", ";
18+
std::cout << info.col_names[i];
19+
}
20+
std::cout << std::endl
1721
<< "Dimensions: " << info.n_rows << " rows x " << info.n_cols << " columns" << std::endl
1822
<< "Delimiter: " << info.delim << std::endl;
1923

single_include_test/file1.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,12 @@ int foobar(int argc, char** argv) {
1212
std::string file = argv[1];
1313
auto info = get_file_info(file);
1414

15-
std::cout << file << std::endl
16-
<< "Columns: " << internals::format_row(info.col_names, ", ")
15+
std::cout << file << std::endl << "Columns: ";
16+
for (size_t i = 0; i < info.col_names.size(); i++) {
17+
if (i) std::cout << ", ";
18+
std::cout << info.col_names[i];
19+
}
20+
std::cout << std::endl
1721
<< "Dimensions: " << info.n_rows << " rows x " << info.n_cols << " columns" << std::endl
1822
<< "Delimiter: " << info.delim << std::endl;
1923

tests/test_read_csv_file.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ using std::string;
1515

1616
#ifndef __EMSCRIPTEN__
1717
TEST_CASE("col_pos() Test", "[test_col_pos]") {
18-
int pos = get_col_pos(
18+
auto pos = get_col_pos(
1919
"./tests/data/real_data/2015_StateDepartment.csv",
2020
"Entity Type");
2121
REQUIRE(pos == 1);

0 commit comments

Comments
 (0)