Skip to content

Commit 4a1ccba

Browse files
committed
Improved empty row handling
1 parent cdc978f commit 4a1ccba

7 files changed

Lines changed: 187 additions & 32 deletions

File tree

include/internal/basic_csv_parser.cpp

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,24 @@ namespace csv {
2525
return get_csv_head(filename, get_file_size(filename));
2626
}
2727

28+
CSV_INLINE size_t infer_n_cols_from_head(csv::string_view head, CSVFormat format) {
29+
std::stringstream source(head.data());
30+
RowCollection rows;
31+
32+
StreamParser<std::stringstream> parser(source, format);
33+
parser.set_output(rows);
34+
parser.next();
35+
36+
for (size_t i = 0; i < rows.size(); i++) {
37+
auto& row = rows[i];
38+
if (row.size() > 0) {
39+
return row.size();
40+
}
41+
}
42+
43+
return 0;
44+
}
45+
2846
CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) {
2947
const size_t bytes = 500000;
3048

@@ -158,18 +176,27 @@ namespace csv {
158176
this->data_pos_++;
159177
break;
160178

179+
case ParseFlags::CARRIAGE_RETURN:
180+
// Handles CRLF (we do not advance by 2 here, the NEWLINE case will handle it)
181+
if (this->data_pos_ + 1 < in.size() && parse_flag(in[this->data_pos_ + 1]) == ParseFlags::NEWLINE) {
182+
this->data_pos_++;
183+
}
184+
185+
// Intentionally fall through to handle the newline in the next case
186+
// If CR (old Mac style newline), fallthrough handles it
187+
161188
case ParseFlags::NEWLINE:
162189
this->data_pos_++;
163190

164-
// Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines)
165-
while (this->data_pos_ < in.size() && parse_flag(in[this->data_pos_]) == ParseFlags::NEWLINE)
166-
this->data_pos_++;
167-
168-
// End of record -> Write non-empty record
169-
if (this->field_length_ > 0 || !this->current_row_.empty()) {
191+
// End of record. Preserve intentional empty fields such as
192+
// trailing delimiters and quoted empty strings, but leave a
193+
// truly blank line as an empty row.
194+
if (this->field_length_ > 0
195+
|| this->field_start_ != UNINITIALIZED_FIELD
196+
|| !this->current_row_.empty()) {
170197
this->push_field();
171-
this->push_row();
172198
}
199+
this->push_row();
173200

174201
// Reset
175202
this->current_row_ = CSVRow(data_ptr_, this->data_pos_, fields_->size());

include/internal/basic_csv_parser.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ namespace csv {
4343
CSV_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter) {
4444
auto ret = arrayToDefault<ParseFlagMap>(ParseFlags::NOT_SPECIAL);
4545
ret[delimiter + CHAR_OFFSET] = ParseFlags::DELIMITER;
46-
ret['\r' + CHAR_OFFSET] = ParseFlags::NEWLINE;
46+
ret['\r' + CHAR_OFFSET] = ParseFlags::CARRIAGE_RETURN;
4747
ret['\n' + CHAR_OFFSET] = ParseFlags::NEWLINE;
4848
return ret;
4949
}
@@ -101,6 +101,8 @@ namespace csv {
101101
CSV_INLINE size_t get_file_size(csv::string_view filename);
102102

103103
CSV_INLINE std::string get_csv_head(csv::string_view filename);
104+
105+
CSV_INLINE size_t infer_n_cols_from_head(csv::string_view head, CSVFormat format);
104106
}
105107

106108
/** Standard type for storing collection of rows */

include/internal/common.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,8 @@ namespace csv {
240240
QUOTE_ESCAPE_QUOTE = 0, /**< A quote inside or terminating a quote_escaped field */
241241
QUOTE = 2 | 1, /**< Characters which may signify a quote escape */
242242
NOT_SPECIAL = 4, /**< Characters with no special meaning or escaped delimiters and newlines */
243-
DELIMITER = 4 | 2, /**< Characters which signify a new field */
243+
DELIMITER = 4 | 1, /**< Characters which signify a new field */
244+
CARRIAGE_RETURN = 4 | 2, /**< Characters which signify a carriage return */
244245
NEWLINE = 4 | 2 | 1 /**< Characters which signify a new row */
245246
};
246247

@@ -252,7 +253,9 @@ namespace csv {
252253

253254
// Assumed to be true by parsing functions: allows for testing
254255
// if an item is DELIMITER or NEWLINE with a >= statement
256+
STATIC_ASSERT(ParseFlags::DELIMITER < ParseFlags::CARRIAGE_RETURN);
255257
STATIC_ASSERT(ParseFlags::DELIMITER < ParseFlags::NEWLINE);
258+
STATIC_ASSERT(ParseFlags::CARRIAGE_RETURN < ParseFlags::NEWLINE);
256259

257260
/** Optimizations for reducing branching in parsing loop
258261
*
@@ -262,11 +265,13 @@ namespace csv {
262265
STATIC_ASSERT(quote_escape_flag(ParseFlags::NOT_SPECIAL, false) == ParseFlags::NOT_SPECIAL);
263266
STATIC_ASSERT(quote_escape_flag(ParseFlags::QUOTE, false) == ParseFlags::QUOTE);
264267
STATIC_ASSERT(quote_escape_flag(ParseFlags::DELIMITER, false) == ParseFlags::DELIMITER);
268+
STATIC_ASSERT(quote_escape_flag(ParseFlags::CARRIAGE_RETURN, false) == ParseFlags::CARRIAGE_RETURN);
265269
STATIC_ASSERT(quote_escape_flag(ParseFlags::NEWLINE, false) == ParseFlags::NEWLINE);
266270

267271
STATIC_ASSERT(quote_escape_flag(ParseFlags::NOT_SPECIAL, true) == ParseFlags::NOT_SPECIAL);
268272
STATIC_ASSERT(quote_escape_flag(ParseFlags::QUOTE, true) == ParseFlags::QUOTE_ESCAPE_QUOTE);
269273
STATIC_ASSERT(quote_escape_flag(ParseFlags::DELIMITER, true) == ParseFlags::NOT_SPECIAL);
274+
STATIC_ASSERT(quote_escape_flag(ParseFlags::CARRIAGE_RETURN, true) == ParseFlags::NOT_SPECIAL);
270275
STATIC_ASSERT(quote_escape_flag(ParseFlags::NEWLINE, true) == ParseFlags::NOT_SPECIAL);
271276

272277
/** An array which maps ASCII chars to a parsing flag */

include/internal/csv_format.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ namespace csv {
2121
enum class VariableColumnPolicy {
2222
THROW = -1,
2323
IGNORE_ROW = 0,
24-
KEEP = 1
24+
KEEP = 1,
25+
KEEP_NON_EMPTY = 2
2526
};
2627

2728
/** Determines how column name lookups are performed */

include/internal/csv_reader.cpp

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,10 @@ namespace csv {
185185
if (!format.col_names.empty())
186186
this->set_col_names(format.col_names);
187187

188+
if (format.header < 0 && format.col_names.empty()) {
189+
this->n_cols = internals::infer_n_cols_from_head(head, format);
190+
}
191+
188192
this->parser = std::unique_ptr<Parser>(new Parser(filename, format, this->col_names)); // For C++11
189193
this->initial_read();
190194
#endif
@@ -352,18 +356,29 @@ namespace csv {
352356
this->_read_requested = true;
353357
continue;
354358
}
355-
else if (this->records->front().size() != this->n_cols &&
356-
this->_format.variable_column_policy != VariableColumnPolicy::KEEP) {
357-
auto errored_row = this->records->pop_front();
359+
else {
360+
const auto policy = this->_format.variable_column_policy;
361+
const size_t next_row_size = this->records->front().size();
362+
363+
if (policy == VariableColumnPolicy::KEEP_NON_EMPTY && next_row_size == 0) {
364+
this->records->pop_front();
365+
continue;
366+
}
358367

359-
if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) {
360-
if (errored_row.size() < this->n_cols)
361-
throw std::runtime_error("Line too short " + internals::format_row(errored_row));
368+
if (next_row_size != this->n_cols &&
369+
(policy == VariableColumnPolicy::THROW || policy == VariableColumnPolicy::IGNORE_ROW)) {
370+
auto errored_row = this->records->pop_front();
362371

363-
throw std::runtime_error("Line too long " + internals::format_row(errored_row));
372+
if (policy == VariableColumnPolicy::THROW) {
373+
if (errored_row.size() < this->n_cols)
374+
throw std::runtime_error("Line too short " + internals::format_row(errored_row));
375+
376+
throw std::runtime_error("Line too long " + internals::format_row(errored_row));
377+
}
378+
379+
continue;
364380
}
365-
}
366-
else {
381+
367382
row = this->records->pop_front();
368383
this->_n_rows++;
369384
this->_read_requested = false; // Reset flag on successful read

include/internal/csv_reader.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,10 @@ namespace csv {
437437
this->set_col_names(format.col_names);
438438
}
439439

440+
if (format.header < 0 && format.col_names.empty()) {
441+
this->n_cols = internals::infer_n_cols_from_head(head, format);
442+
}
443+
440444
#ifdef _MSC_VER
441445
#pragma warning(push)
442446
#pragma warning(disable:4316)

tests/test_read_csv.cpp

Lines changed: 114 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,36 @@ TEST_CASE("Test Escaped Newline & Empty Last Column", "[read_csv_empty_last_colu
114114
vector<string>({ "4", "5", "6", ""}));
115115
}
116116

117+
TEST_CASE("Test Unquoted Trailing Empty Field", "[read_csv_empty_last_column]") {
118+
auto rows = "A,B,C,D\r\n" // Header row
119+
"1,2,3,\r\n"
120+
"4,5,6,"_csv;
121+
122+
CSVRow row;
123+
rows.read_row(row);
124+
REQUIRE(vector<string>(row) ==
125+
vector<string>({ "1", "2", "3", "" }));
126+
127+
rows.read_row(row);
128+
REQUIRE(vector<string>(row) ==
129+
vector<string>({ "4", "5", "6", "" }));
130+
}
131+
132+
TEST_CASE("Test Quoted Empty Single-Column Row", "[read_csv_empty_last_column]") {
133+
auto rows = "A\r\n" // Header row
134+
"\"\"\r\n"
135+
"value"_csv;
136+
137+
CSVRow row;
138+
rows.read_row(row);
139+
REQUIRE(vector<string>(row) ==
140+
vector<string>({ "" }));
141+
142+
rows.read_row(row);
143+
REQUIRE(vector<string>(row) ==
144+
vector<string>({ "value" }));
145+
}
146+
117147
TEST_CASE( "Test Empty Field", "[read_empty_field]" ) {
118148
// Per RFC 1480, escaped quotes should be doubled up
119149
auto rows = "A,B,C\r\n" // Header row
@@ -183,8 +213,7 @@ TEST_CASE( "Test leading and trailing escaped quote", "[read_csv_quote]" ) {
183213
}
184214
//! [Parse Example]
185215

186-
// Verify the CSV parser can handle any arbitrary line endings composed of carriage return & newline
187-
TEST_CASE("Cursed Newlines", "[read_csv_cursed_newline]") {
216+
TEST_CASE("Normal Newlines", "[read_csv_normal_newline]") {
188217
auto row_str = GENERATE(as<std::string> {},
189218
// Windows style
190219
"A,B,C\r\n" // Header row
@@ -198,20 +227,13 @@ TEST_CASE("Cursed Newlines", "[read_csv_cursed_newline]") {
198227
"1,2,3\n"
199228
"4,5,6",
200229

201-
// Eww brother what is that...
202-
"A,B,C\r\r\n" // Header row
203-
"123,234,345\r\r\n"
204-
"1,2,3\r\r\n"
205-
"4,5,6",
206-
207-
// Doubled-up Windows style (ridiculous: but I'm sure it exists somewhere)
208-
"A,B,C\r\n\r\n" // Header row
209-
"123,234,345\r\n\r\n"
210-
"1,2,3\r\n\r\n"
230+
// Old Mac Style
231+
"A,B,C\r" // Header row
232+
"123,234,345\r"
233+
"1,2,3\r"
211234
"4,5,6"
212235
);
213236

214-
// Set CSVFormat to KEEP all rows, even empty ones (because there shouldn't be any)
215237
CSVFormat format;
216238
format.header_row(0).variable_columns(VariableColumnPolicy::KEEP);
217239
auto rows = parse(row_str, format);
@@ -235,6 +257,85 @@ TEST_CASE("Cursed Newlines", "[read_csv_cursed_newline]") {
235257
REQUIRE(rows.n_rows() == 3);
236258
}
237259

260+
261+
TEST_CASE("Edge-Case Newlines", "[read_csv_edge_case_newline]") {
262+
auto row_str = GENERATE(as<std::string> {},
263+
// Eww brother what is that...
264+
"A,B,C\r\r\n" // Header row
265+
"123,234,345\r\r\n"
266+
"1,2,3\r\r\n"
267+
"4,5,6",
268+
269+
// Doubled-up Windows style
270+
"A,B,C\r\n\r\n" // Header row
271+
"123,234,345\r\n\r\n"
272+
"1,2,3\r\n\r\n"
273+
"4,5,6"
274+
);
275+
276+
SECTION("KEEP policy - all rows including blanks") {
277+
CSVFormat format;
278+
format.header_row(0).variable_columns(VariableColumnPolicy::KEEP);
279+
auto rows = parse(row_str, format);
280+
281+
CSVRow row;
282+
283+
// Blank line from the trailing \r\n after the header
284+
rows.read_row(row);
285+
REQUIRE(row.empty());
286+
287+
rows.read_row(row);
288+
vector<string> first_row = { "123", "234", "345" };
289+
REQUIRE(vector<string>(row) == first_row);
290+
REQUIRE(row["A"] == "123");
291+
REQUIRE(row["B"] == "234");
292+
REQUIRE(row["C"] == "345");
293+
294+
// Blank line
295+
rows.read_row(row);
296+
REQUIRE(row.empty());
297+
298+
rows.read_row(row);
299+
vector<string> second_row = { "1", "2", "3" };
300+
REQUIRE(vector<string>(row) == second_row);
301+
302+
// Blank line
303+
rows.read_row(row);
304+
REQUIRE(row.empty());
305+
306+
rows.read_row(row);
307+
vector<string> third_row = { "4", "5", "6" };
308+
REQUIRE(vector<string>(row) == third_row);
309+
310+
REQUIRE(rows.n_rows() == 6);
311+
}
312+
313+
SECTION("KEEP_NON_EMPTY policy - skip blank rows") {
314+
CSVFormat format;
315+
format.header_row(0).variable_columns(VariableColumnPolicy::KEEP_NON_EMPTY);
316+
auto rows = parse(row_str, format);
317+
318+
CSVRow row;
319+
320+
rows.read_row(row);
321+
vector<string> first_row = { "123", "234", "345" };
322+
REQUIRE(vector<string>(row) == first_row);
323+
REQUIRE(row["A"] == "123");
324+
REQUIRE(row["B"] == "234");
325+
REQUIRE(row["C"] == "345");
326+
327+
rows.read_row(row);
328+
vector<string> second_row = { "1", "2", "3" };
329+
REQUIRE(vector<string>(row) == second_row);
330+
331+
rows.read_row(row);
332+
vector<string> third_row = { "4", "5", "6" };
333+
REQUIRE(vector<string>(row) == third_row);
334+
335+
REQUIRE(rows.n_rows() == 3);
336+
}
337+
}
338+
238339
TEST_CASE("Test Whitespace Trimming", "[read_csv_trim]") {
239340
auto row_str = GENERATE(as<std::string> {},
240341
"A,B,C\r\n" // Header row

0 commit comments

Comments
 (0)