Skip to content

Commit 3c5be3d

Browse files
committed
Add case-insensitive column lookup
1 parent bb3f5a3 commit 3c5be3d

8 files changed

Lines changed: 263 additions & 9 deletions

File tree

README.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ CSVReader sstream_reader(my_csv, format);
273273
```
274274

275275
### Indexing by Column Names
276-
Retrieving values using a column name string is a cheap, constant time operation.
276+
Retrieving values using a column name string is a cheap, constant-time operation with `EXACT` matching; with `CASE_INSENSITIVE`, the key is first converted to lowercase before the lookup.
277277

278278
```cpp
279279
# include "csv.hpp"
@@ -282,7 +282,12 @@ using namespace csv;
282282

283283
...
284284

285-
CSVReader reader("very_big_file.csv");
285+
// Optional: pass in a format to customize lookup behavior
286+
// Defaults to EXACT matching
287+
CSVFormat format;
288+
format.column_names_policy(ColumnNamePolicy::CASE_INSENSITIVE);
289+
290+
CSVReader reader("very_big_file.csv", format);
286291
double sum = 0;
287292

288293
for (auto& row: reader) {

include/csv.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
CSV for C++, version 3.0.0
2+
CSV for C++, version 3.1.0
33
https://github.com/vincentlaucsb/csv-parser
44
55
MIT License

include/internal/col_names.cpp

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#include <algorithm>
2+
#include <cctype>
13
#include "col_names.hpp"
24

35
namespace csv {
@@ -8,20 +10,44 @@ namespace csv {
810

911
/** Replaces the stored column names and rebuilds the name -> index lookup map.
 *
 *  The names are stored with their original casing. The lookup map is keyed by
 *  the lowercased name under ColumnNamePolicy::CASE_INSENSITIVE and by the
 *  unmodified name otherwise. When two names produce the same key, the later
 *  index overwrites the earlier one.
 */
CSV_INLINE void ColNames::set_col_names(const std::vector<std::string>& cnames) {
    this->col_names = cnames;
    this->col_pos.clear();

    // The policy cannot change while the map is being rebuilt, so test it once
    const bool fold_case = (this->_policy == csv::ColumnNamePolicy::CASE_INSENSITIVE);

    size_t idx = 0;
    for (const std::string& name : cnames) {
        if (fold_case) {
            // Write the lowercased characters straight into a pre-sized key
            std::string key(name.size(), '\0');
            std::transform(name.begin(), name.end(), key.begin(),
                [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
            this->col_pos[key] = idx;
        } else {
            this->col_pos[name] = idx;
        }

        ++idx;
    }
}
1628

1729
/** Returns the position of a column, or CSV_NOT_FOUND if there is no such column.
 *
 *  @param[in] col_name Column name to look up. Under
 *                      ColumnNamePolicy::CASE_INSENSITIVE it is lowercased before
 *                      the lookup, matching the lowercased keys that
 *                      set_col_names() stores for that policy.
 *  @return Index of the column, or CSV_NOT_FOUND.
 */
CSV_INLINE int ColNames::index_of(csv::string_view col_name) const {
    // Build the key from the view's full extent. A string_view is not guaranteed
    // to be null-terminated, so a key built from col_name.data() alone could read
    // past the end of the view or stop early at an embedded '\0'.
    std::string key(col_name);

    if (this->_policy == csv::ColumnNamePolicy::CASE_INSENSITIVE) {
        std::transform(key.begin(), key.end(), key.begin(),
            [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    }

    const auto pos = this->col_pos.find(key);
    if (pos == this->col_pos.end())
        return CSV_NOT_FOUND;

    return static_cast<int>(pos->second);
}
2446

47+
/** Stores the policy that set_col_names() and index_of() consult.
 *
 *  Only the policy flag is updated here; the existing lookup map is left as is.
 */
CSV_INLINE void ColNames::set_policy(csv::ColumnNamePolicy policy) {
    _policy = policy;
}
50+
2551
/** Returns the number of column names currently stored. */
CSV_INLINE size_t ColNames::size() const noexcept {
    return col_names.size();
}

include/internal/col_names.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <vector>
77

88
#include "common.hpp"
9+
#include "csv_format.hpp"
910

1011
namespace csv {
1112
namespace internals {
@@ -30,6 +31,11 @@ namespace csv {
3031
void set_col_names(const std::vector<std::string>&);
3132
int index_of(csv::string_view) const;
3233

34+
/** Sets the column name lookup policy.
35+
* Must be called before set_col_names() for the CASE_INSENSITIVE policy to take effect: the lookup map is only rebuilt by set_col_names().
36+
*/
37+
void set_policy(csv::ColumnNamePolicy policy);
38+
3339
bool empty() const noexcept { return this->col_names.empty(); }
3440
size_t size() const noexcept;
3541

@@ -39,6 +45,7 @@ namespace csv {
3945
private:
4046
std::vector<std::string> col_names;
4147
std::unordered_map<std::string, size_t> col_pos;
48+
csv::ColumnNamePolicy _policy = csv::ColumnNamePolicy::EXACT;
4249
};
4350
}
4451
}

include/internal/csv_format.hpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,12 @@ namespace csv {
2424
KEEP = 1
2525
};
2626

27+
/** Determines how column name lookups are performed */
28+
enum class ColumnNamePolicy {
29+
EXACT = 0, /**< Case-sensitive match (default) */
30+
CASE_INSENSITIVE = 1 /**< Case-insensitive match */
31+
};
32+
2733
/** Stores the inferred format of a CSV file. */
2834
struct CSVGuessResult {
2935
char delim;
@@ -104,6 +110,17 @@ namespace csv {
104110
return *this;
105111
}
106112

113+
/** Sets the column name lookup policy.
114+
*
115+
* @param[in] policy Use ColumnNamePolicy::CASE_INSENSITIVE to allow
116+
* case-insensitive column lookups via CSVRow::operator[]
117+
* and CSVReader::index_of().
118+
*/
119+
CONSTEXPR_14 CSVFormat& column_names_policy(ColumnNamePolicy policy) {
120+
this->_column_name_policy = policy;
121+
return *this;
122+
}
123+
107124
/** Sets the chunk size used when reading the CSV
108125
*
109126
* @param[in] size Chunk size in bytes (minimum: 10MB = ITERATION_CHUNK_SIZE)
@@ -131,6 +148,7 @@ namespace csv {
131148
std::vector<char> get_possible_delims() const { return this->possible_delimiters; }
132149
std::vector<char> get_trim_chars() const { return this->trim_chars; }
133150
CONSTEXPR VariableColumnPolicy get_variable_column_policy() const { return this->variable_column_policy; }
151+
CONSTEXPR ColumnNamePolicy get_column_name_policy() const { return this->_column_name_policy; }
134152
CONSTEXPR size_t get_chunk_size() const { return this->_chunk_size; }
135153
#endif
136154

@@ -176,6 +194,9 @@ namespace csv {
176194
/** Allow variable length columns? */
177195
VariableColumnPolicy variable_column_policy = VariableColumnPolicy::IGNORE_ROW;
178196

197+
/** Column name lookup policy */
198+
ColumnNamePolicy _column_name_policy = ColumnNamePolicy::EXACT;
199+
179200
/** Chunk size for reading; passed to CSVReader at construction time */
180201
size_t _chunk_size = internals::ITERATION_CHUNK_SIZE;
181202
};

include/internal/csv_reader.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -225,11 +225,7 @@ namespace csv {
225225
* csv::CSV_NOT_FOUND otherwise.
226226
*/
227227
CSV_INLINE int CSVReader::index_of(csv::string_view col_name) const {
    // Forward to the ColNames object, which owns the name -> index map
    return col_names->index_of(col_name);
}
234230

235231
CSV_INLINE void CSVReader::trim_header() {
@@ -252,6 +248,7 @@ namespace csv {
252248
*/
253249
CSV_INLINE void CSVReader::set_col_names(const std::vector<std::string>& names)
{
    auto& header = *this->col_names;

    // Apply the policy first: ColNames reads it while building its lookup map
    header.set_policy(this->_format.get_column_name_policy());
    header.set_col_names(names);

    n_cols = names.size();
}

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ target_sources(csv_test
2323
PRIVATE
2424
${CSV_INCLUDE_DIR}/csv.hpp
2525
main.cpp
26+
test_col_names.cpp
2627
test_csv_delimiter.cpp
2728
test_csv_field.cpp
2829
test_csv_field_array.cpp

tests/test_col_names.cpp

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
/** @file
2+
* Unit tests for csv::internals::ColNames
3+
*/
4+
5+
#include <catch2/catch_all.hpp>
6+
#include "internal/col_names.hpp"
7+
8+
using namespace csv;
9+
using namespace csv::internals;
10+
11+
// ============================================================
12+
// Default (exact / case-sensitive) policy
13+
// ============================================================
14+
15+
TEST_CASE("ColNames - empty on construction", "[col_names]") {
16+
ColNames cn;
17+
REQUIRE(cn.empty());
18+
REQUIRE(cn.size() == 0);
19+
}
20+
21+
TEST_CASE("ColNames - set and get column names", "[col_names]") {
22+
ColNames cn;
23+
cn.set_col_names({"A", "B", "C"});
24+
25+
REQUIRE(cn.size() == 3);
26+
REQUIRE_FALSE(cn.empty());
27+
28+
auto names = cn.get_col_names();
29+
REQUIRE(names.size() == 3);
30+
REQUIRE(names[0] == "A");
31+
REQUIRE(names[1] == "B");
32+
REQUIRE(names[2] == "C");
33+
}
34+
35+
TEST_CASE("ColNames - constructor with names", "[col_names]") {
36+
ColNames cn({"X", "Y", "Z"});
37+
REQUIRE(cn.size() == 3);
38+
REQUIRE(cn.get_col_names() == std::vector<std::string>{"X", "Y", "Z"});
39+
}
40+
41+
TEST_CASE("ColNames - index_of (exact policy)", "[col_names]") {
42+
ColNames cn({"Name", "Age", "City"});
43+
44+
REQUIRE(cn.index_of("Name") == 0);
45+
REQUIRE(cn.index_of("Age") == 1);
46+
REQUIRE(cn.index_of("City") == 2);
47+
}
48+
49+
TEST_CASE("ColNames - index_of returns CSV_NOT_FOUND for missing column (exact)", "[col_names]") {
50+
ColNames cn({"Name", "Age"});
51+
52+
REQUIRE(cn.index_of("missing") == CSV_NOT_FOUND);
53+
REQUIRE(cn.index_of("") == CSV_NOT_FOUND);
54+
}
55+
56+
TEST_CASE("ColNames - exact policy is case-sensitive", "[col_names]") {
57+
ColNames cn({"Name", "Age"});
58+
59+
// Different case must not match under EXACT policy
60+
REQUIRE(cn.index_of("name") == CSV_NOT_FOUND);
61+
REQUIRE(cn.index_of("NAME") == CSV_NOT_FOUND);
62+
REQUIRE(cn.index_of("AGE") == CSV_NOT_FOUND);
63+
}
64+
65+
TEST_CASE("ColNames - operator[] by index", "[col_names]") {
66+
ColNames cn({"First", "Second", "Third"});
67+
68+
REQUIRE(cn[0] == "First");
69+
REQUIRE(cn[1] == "Second");
70+
REQUIRE(cn[2] == "Third");
71+
}
72+
73+
TEST_CASE("ColNames - operator[] throws on out-of-bounds", "[col_names]") {
74+
ColNames cn({"A", "B"});
75+
REQUIRE_THROWS_AS(cn[2], std::out_of_range);
76+
REQUIRE_THROWS_AS(cn[100], std::out_of_range);
77+
}
78+
79+
TEST_CASE("ColNames - operator[] throws on empty ColNames", "[col_names]") {
80+
ColNames cn;
81+
REQUIRE_THROWS_AS(cn[0], std::out_of_range);
82+
}
83+
84+
TEST_CASE("ColNames - set_col_names replaces existing names", "[col_names]") {
85+
ColNames cn({"Old1", "Old2"});
86+
cn.set_col_names({"New1", "New2", "New3"});
87+
88+
REQUIRE(cn.size() == 3);
89+
REQUIRE(cn.index_of("New1") == 0);
90+
REQUIRE(cn.index_of("Old1") == CSV_NOT_FOUND);
91+
}
92+
93+
// ============================================================
94+
// Case-insensitive policy
95+
// ============================================================
96+
97+
TEST_CASE("ColNames - case-insensitive index_of: lowercase query", "[col_names][case_insensitive]") {
98+
ColNames cn;
99+
cn.set_policy(ColumnNamePolicy::CASE_INSENSITIVE);
100+
cn.set_col_names({"Name", "Age", "City"});
101+
102+
REQUIRE(cn.index_of("name") == 0);
103+
REQUIRE(cn.index_of("age") == 1);
104+
REQUIRE(cn.index_of("city") == 2);
105+
}
106+
107+
TEST_CASE("ColNames - case-insensitive index_of: uppercase query", "[col_names][case_insensitive]") {
108+
ColNames cn;
109+
cn.set_policy(ColumnNamePolicy::CASE_INSENSITIVE);
110+
cn.set_col_names({"Name", "Age", "City"});
111+
112+
REQUIRE(cn.index_of("NAME") == 0);
113+
REQUIRE(cn.index_of("AGE") == 1);
114+
REQUIRE(cn.index_of("CITY") == 2);
115+
}
116+
117+
TEST_CASE("ColNames - case-insensitive index_of: exact query still works", "[col_names][case_insensitive]") {
118+
ColNames cn;
119+
cn.set_policy(ColumnNamePolicy::CASE_INSENSITIVE);
120+
cn.set_col_names({"Name", "Age", "City"});
121+
122+
REQUIRE(cn.index_of("Name") == 0);
123+
REQUIRE(cn.index_of("Age") == 1);
124+
REQUIRE(cn.index_of("City") == 2);
125+
}
126+
127+
TEST_CASE("ColNames - case-insensitive missing column returns CSV_NOT_FOUND", "[col_names][case_insensitive]") {
128+
ColNames cn;
129+
cn.set_policy(ColumnNamePolicy::CASE_INSENSITIVE);
130+
cn.set_col_names({"Name", "Age"});
131+
132+
REQUIRE(cn.index_of("missing") == CSV_NOT_FOUND);
133+
REQUIRE(cn.index_of("") == CSV_NOT_FOUND);
134+
}
135+
136+
TEST_CASE("ColNames - case-insensitive get_col_names preserves original casing", "[col_names][case_insensitive]") {
137+
// The stored names should be in their original form even under CI policy.
138+
// The lowercase transform is internal to the lookup map only.
139+
ColNames cn;
140+
cn.set_policy(ColumnNamePolicy::CASE_INSENSITIVE);
141+
cn.set_col_names({"ReportDt", "Unit", "Power"});
142+
143+
auto names = cn.get_col_names();
144+
REQUIRE(names[0] == "ReportDt");
145+
REQUIRE(names[1] == "Unit");
146+
REQUIRE(names[2] == "Power");
147+
}
148+
149+
TEST_CASE("ColNames - case-insensitive operator[] preserves original casing", "[col_names][case_insensitive]") {
150+
ColNames cn;
151+
cn.set_policy(ColumnNamePolicy::CASE_INSENSITIVE);
152+
cn.set_col_names({"ReportDt", "Unit"});
153+
154+
REQUIRE(cn[0] == "ReportDt");
155+
REQUIRE(cn[1] == "Unit");
156+
}
157+
158+
TEST_CASE("ColNames - policy must be set before set_col_names to take effect", "[col_names][case_insensitive]") {
159+
// set_col_names called BEFORE set_policy: map is built with exact keys,
160+
// so CI lookup will not work.
161+
ColNames cn({"Name", "Age"}); // policy is EXACT at this point
162+
cn.set_policy(ColumnNamePolicy::CASE_INSENSITIVE);
163+
164+
// The map was built with exact keys so lowercase query won't find anything.
165+
REQUIRE(cn.index_of("name") == CSV_NOT_FOUND);
166+
167+
// After rebuilding the map, CI works.
168+
cn.set_col_names(cn.get_col_names());
169+
REQUIRE(cn.index_of("name") == 0);
170+
}
171+
172+
// ============================================================
173+
// Edge cases
174+
// ============================================================
175+
176+
TEST_CASE("ColNames - empty column name list", "[col_names][edge_cases]") {
177+
ColNames cn;
178+
cn.set_col_names({});
179+
180+
REQUIRE(cn.empty());
181+
REQUIRE(cn.size() == 0);
182+
REQUIRE(cn.index_of("anything") == CSV_NOT_FOUND);
183+
}
184+
185+
TEST_CASE("ColNames - column name that is an empty string", "[col_names][edge_cases]") {
186+
ColNames cn({"", "B", "C"});
187+
REQUIRE(cn.index_of("") == 0);
188+
REQUIRE(cn.index_of("B") == 1);
189+
}
190+
191+
TEST_CASE("ColNames - duplicate column names: last index wins", "[col_names][edge_cases]") {
192+
// When header has duplicate names the last occurrence wins in the hash map.
193+
// This documents current behavior rather than prescribing it.
194+
ColNames cn({"dup", "other", "dup"});
195+
REQUIRE(cn.index_of("dup") == 2);
196+
REQUIRE(cn.index_of("other") == 1);
197+
}

0 commit comments

Comments
 (0)