diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index 2b12dc6..1547aa4 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -8,6 +8,9 @@ on: pull_request: branches: [ "master" ] +permissions: + contents: read + jobs: build: runs-on: ${{ matrix.os }} @@ -47,12 +50,12 @@ jobs: steps: - name: Checkout repository and submodules - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: submodules: recursive - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.x' @@ -93,3 +96,4 @@ jobs: run: ctest --build-config ${{ matrix.build_type }} + diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 0e6749a..c347de5 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -24,7 +24,7 @@ jobs: steps: - name: Checkout repository and submodules - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: submodules: recursive diff --git a/.github/workflows/compat-edge-cases.yml b/.github/workflows/compat-edge-cases.yml new file mode 100644 index 0000000..e4eec4a --- /dev/null +++ b/.github/workflows/compat-edge-cases.yml @@ -0,0 +1,103 @@ +name: Compatibility Edge Cases + +permissions: + contents: read + +on: + push: + branches: [ "master" ] + paths: + - "include/**" + - "single_header.py" + - "CMakeLists.txt" + - ".github/workflows/compat-edge-cases.yml" + pull_request: + branches: [ "master" ] + paths: + - "include/**" + - "single_header.py" + - "CMakeLists.txt" + - ".github/workflows/compat-edge-cases.yml" + workflow_dispatch: + +jobs: + msvc-without-zc-cplusplus: + name: MSVC without /Zc:__cplusplus + runs-on: windows-latest + + steps: + - name: Checkout repository and submodules + uses: actions/checkout@v5 + with: + submodules: recursive + + - name: Set up MSVC developer command prompt + uses: ilammy/msvc-dev-cmd@v1 + + - name: Set up Python + uses: 
actions/setup-python@v6 + with: + python-version: '3.x' + + - name: Configure CMake (for single-header generation) + run: > + cmake -S ${{ github.workspace }} + -B ${{ github.workspace }}/build/no-zc-cplusplus + -DCSV_CXX_STANDARD=20 + -DCSV_BUILD_PROGRAMS=OFF + -DBUILD_PYTHON=OFF + + - name: Generate amalgamated single-header + run: cmake --build ${{ github.workspace }}/build/no-zc-cplusplus --config Release --target generate_single_header + + - name: Compile generated single-header without /Zc:__cplusplus + shell: cmd + run: | + REM NOTE: single_include/csv.hpp is an intentional compatibility shim; compile the generated amalgamated header instead. + echo #include "csv.hpp" > ci_no_zc_single_header.cpp + echo int main() { return 0; } >> ci_no_zc_single_header.cpp + cl /nologo /std:c++20 /EHsc /Zc:__cplusplus- /I build\no-zc-cplusplus\single_include_generated /c ci_no_zc_single_header.cpp + + - name: Compile unamalgamated header without /Zc:__cplusplus + shell: cmd + run: | + echo #include "csv.hpp" > ci_no_zc_unamalgamated.cpp + echo int main() { return 0; } >> ci_no_zc_unamalgamated.cpp + cl /nologo /std:c++20 /EHsc /Zc:__cplusplus- /I include /c ci_no_zc_unamalgamated.cpp + + mingw-minimal: + name: MinGW-w64 minimal build and test (C++17) + runs-on: windows-latest + + steps: + - name: Checkout repository and submodules + uses: actions/checkout@v5 + with: + submodules: recursive + + - name: Set up MSYS2 (MinGW64) + uses: msys2/setup-msys2@v2 + with: + msystem: MINGW64 + update: true + install: >- + mingw-w64-x86_64-gcc + mingw-w64-x86_64-cmake + mingw-w64-x86_64-ninja + + - name: Configure CMake (MinGW, C++17) + shell: msys2 {0} + run: > + cmake -S . 
-B build/mingw-minimal -G Ninja + -DCMAKE_BUILD_TYPE=Release + -DCSV_CXX_STANDARD=17 + -DCSV_BUILD_PROGRAMS=OFF + -DBUILD_PYTHON=OFF + + - name: Build + shell: msys2 {0} + run: cmake --build build/mingw-minimal + + - name: Test + shell: msys2 {0} + run: ctest --test-dir build/mingw-minimal --output-on-failure diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 20d985c..15b5894 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -16,7 +16,7 @@ jobs: steps: - name: Checkout repository and submodules - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: submodules: recursive diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 4476428..3b1c470 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -21,7 +21,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: submodules: recursive @@ -34,7 +34,7 @@ jobs: run: doxygen Doxyfile - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.x' diff --git a/.github/workflows/release-single-header.yml b/.github/workflows/release-single-header.yml new file mode 100644 index 0000000..80bd896 --- /dev/null +++ b/.github/workflows/release-single-header.yml @@ -0,0 +1,45 @@ +name: Release Single Header Asset + +on: + push: + tags: + - "v*" + workflow_dispatch: + +permissions: + contents: write + +jobs: + publish-single-header: + name: Generate and upload csv.hpp release asset + runs-on: ubuntu-latest + + steps: + - name: Checkout repository and submodules + uses: actions/checkout@v5 + with: + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.x" + + - name: Generate single header + run: | + mkdir -p release-assets + python single_header.py release-assets/csv.hpp + + - name: Generate SHA256 checksum + run: | + cd release-assets + sha256sum csv.hpp > 
csv.hpp.sha256 + + - name: Upload release assets + uses: softprops/action-gh-release@v2 + with: + files: | + release-assets/csv.hpp + release-assets/csv.hpp.sha256 + fail_on_unmatched_files: true + generate_release_notes: true diff --git a/.github/workflows/sanitizers.yml b/.github/workflows/sanitizers.yml index a8dcd4e..e76e7f1 100644 --- a/.github/workflows/sanitizers.yml +++ b/.github/workflows/sanitizers.yml @@ -6,6 +6,9 @@ on: pull_request: branches: [ "master" ] +permissions: + contents: read + jobs: sanitizers: runs-on: ubuntu-latest @@ -33,7 +36,7 @@ jobs: steps: - name: Checkout repository and submodules - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: submodules: recursive @@ -74,7 +77,7 @@ jobs: steps: - name: Checkout repository and submodules - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: submodules: recursive @@ -117,7 +120,7 @@ jobs: steps: - name: Checkout repository and submodules - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: submodules: recursive diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..8df55f3 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,77 @@ +# CSV Parser - AI Agent Context + +Architectural overview for AI assistants working with this codebase. + +> **Maintenance rule:** Whenever this file is changed, `CLAUDE.md` in the same directory must be updated to reflect the changes. `CLAUDE.md` is a bullet-point summary of this file and must stay in sync. + +## Critical: single_include/csv.hpp Is A Shim + +`single_include/csv.hpp` is intentionally **non-functional** and exists only as a compatibility shim. 
+ +- Do **not** compile against `single_include/csv.hpp` +- For single-header validation, generate `build/.../single_include_generated/csv.hpp` via the `generate_single_header` target, then compile that generated file +- For unamalgamated usage, include headers from `include/` + +This guard exists to prevent stale-in-repo amalgamated headers and to force use of the canonical generated distribution. + +## Critical: Two Independent Code Paths + +The `CSVReader` class has **two completely different implementations**: + +```cpp +// PATH 1: Memory-mapped I/O (MmapParser) +CSVReader reader("filename.csv"); + +// PATH 2: Stream-based (StreamParser) +std::ifstream infile("filename.csv", std::ios::binary); +CSVReader reader(infile, format); +``` + +**Impact:** Bugs can exist in one path but not the other (see issue #281). Any test validating parsing behavior must test BOTH paths using Catch2 `SECTION`. + +## Threading: Worker + 10MB Chunks + +- Worker thread reads in 10MB chunks (`ITERATION_CHUNK_SIZE`) +- Communicates via `ThreadSafeDeque` +- Exceptions propagate via `std::exception_ptr` +- Critical: Fields spanning chunk boundaries must not corrupt + +**Testing requirement:** Use ≥500K rows to cross 10MB boundary. 
+ +## Key Files + +| File | Contains | +|------|----------| +| `csv_reader.hpp` | Mmap vs stream constructors | +| `csv_reader.cpp` | Delimiter guessing, header detection | +| `basic_csv_parser.hpp` | Parser base class (IBasicCSVParser, MmapParser, StreamParser) | +| `basic_csv_parser.cpp` | Chunk transitions, worker thread | +| `raw_csv_data.hpp` | Internal parser data structures (RawCSVField, CSVFieldList, RawCSVData) | +| `thread_safe_deque.hpp` | Producer-consumer queue for parser→main thread communication | +| `csv_row.hpp` | Public API types (CSVField, CSVRow) | +| `test_round_trip.cpp` | Exemplar test patterns | + +## Data Flow: Parser → Row API + +``` +Parser Thread Main Thread + ↓ ↓ +RawCSVData (shared_ptr) ─────────────→ CSVRow + ↓ ↓ +CSVFieldList → RawCSVField[] CSVField (lazy unescaping) + ↓ +ThreadSafeDeque +(producer-consumer queue) +``` + +**Thread Safety:** Parser populates `RawCSVData`, pushes `CSVRow` to `ThreadSafeDeque`, main thread pops and reads. The `CSVFieldList` uses chunked allocation (~170 fields/chunk) for cache locality. See `raw_csv_data.hpp` and `thread_safe_deque.hpp` for implementation details. + +## Common Pitfalls + +1. **Don't assume one code path:** Mmap and stream paths are different. Always test both. +2. **Don't write tiny tests:** Need ≥500K rows to cross 10MB chunk boundary. +3. **Don't use uniform values:** Each column needs distinct values to detect corruption. +4. **Don't ignore async:** Worker thread means exceptions must use `exception_ptr`. +5. **Don't change one constructor:** Likely affects both mmap and stream paths. + +See `tests/AGENTS.md` for test strategy, checklist, and conventions. diff --git a/CLAUDE.md b/CLAUDE.md index 98e7c1a..58eff52 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,94 +1,37 @@ -# CSV Parser - AI Agent Context +# CSV Parser - Claude Summary -Architectural overview for AI assistants working with this codebase. +> **`AGENTS.md` is the source of truth.** This file is a bullet-point summary only. 
Always load and follow `AGENTS.md` — it takes precedence over anything here. -## Critical: Two Independent Code Paths +## single_include/csv.hpp +- Non-functional shim — do **not** compile against it +- For single-header use: generate `build/.../single_include_generated/csv.hpp` via `generate_single_header` target +- For unamalgamated use: include from `include/` -The `CSVReader` class has **two completely different implementations**: +## Two Independent Code Paths +- `CSVReader("file.csv")` → MmapParser +- `CSVReader(istream, format)` → StreamParser +- Bugs can exist in one and not the other — always test both with Catch2 `SECTION` -```cpp -// PATH 1: Memory-mapped I/O (MmapParser) -CSVReader reader("filename.csv"); - -// PATH 2: Stream-based (StreamParser) -std::ifstream infile("filename.csv", std::ios::binary); -CSVReader reader(infile, format); -``` - -**Impact:** Bugs can exist in one path but not the other (see issue #281). Any test validating parsing behavior must test BOTH paths using Catch2 `SECTION`. - -## Threading: Worker + 10MB Chunks - -- Worker thread reads in 10MB chunks (`ITERATION_CHUNK_SIZE`) -- Communicates via `ThreadSafeDeque` +## Threading +- Worker thread reads 10MB chunks (`ITERATION_CHUNK_SIZE`) +- Communication via `ThreadSafeDeque` - Exceptions propagate via `std::exception_ptr` -- Critical: Fields spanning chunk boundaries must not corrupt - -**Testing requirement:** Use ≥500K rows to cross 10MB boundary. - -## Test Strategy: Use Distinct Column Values - -❌ **BAD:** `array{i, i, i, i, i}` - All columns identical -✅ **GOOD:** `array{i*5+0, i*5+1, i*5+2, i*5+3, i*5+4}` - Each column distinct - -**Why:** Field corruption is only detectable if columns have different values. 
+- Tests must use ≥500K rows to cross chunk boundary ## Key Files - -| File | Contains | -|------|----------| -| `csv_reader.hpp` | Mmap vs stream constructors | -| `csv_reader.cpp` | Delimiter guessing, header detection | -| `basic_csv_parser.hpp` | Parser base class (IBasicCSVParser, MmapParser, StreamParser) | -| `basic_csv_parser.cpp` | Chunk transitions, worker thread | -| `raw_csv_data.hpp` | Internal parser data structures (RawCSVField, CSVFieldList, RawCSVData) | -| `thread_safe_deque.hpp` | Producer-consumer queue for parser→main thread communication | -| `csv_row.hpp` | Public API types (CSVField, CSVRow) | -| `test_round_trip.cpp` | Exemplar test patterns | - -## Data Flow: Parser → Row API - -``` -Parser Thread Main Thread - ↓ ↓ -RawCSVData (shared_ptr) ─────────────→ CSVRow - ↓ ↓ -CSVFieldList → RawCSVField[] CSVField (lazy unescaping) - ↓ -ThreadSafeDeque -(producer-consumer queue) -``` - -**Thread Safety:** Parser populates `RawCSVData`, pushes `CSVRow` to `ThreadSafeDeque`, main thread pops and reads. The `CSVFieldList` uses chunked allocation (~170 fields/chunk) for cache locality. See `raw_csv_data.hpp` and `thread_safe_deque.hpp` for implementation details. +- `csv_reader.hpp` — mmap vs stream constructors +- `basic_csv_parser.hpp` — MmapParser, StreamParser implementations +- `basic_csv_parser.cpp` — chunk transitions, worker thread +- `raw_csv_data.hpp` — RawCSVField, CSVFieldList, RawCSVData +- `thread_safe_deque.hpp` — producer-consumer queue +- `csv_row.hpp` — CSVField, CSVRow public API ## Common Pitfalls - -1. **Don't assume one code path:** Mmap and stream paths are different. Always test both. -2. **Don't write tiny tests:** Need ≥500K rows to cross 10MB chunk boundary. -3. **Don't use uniform values:** Each column needs distinct values to detect corruption. -4. **Don't ignore async:** Worker thread means exceptions must use `exception_ptr`. -5. **Don't change one constructor:** Likely affects both mmap and stream paths. 
- -## Test Checklist - -- [ ] Tests both mmap and stream paths (use `SECTION`) -- [ ] Distinct values per column -- [ ] ≥500K rows to cross chunk boundary -- [ ] Documents bug it would catch -- [ ] Lambda + SECTION pattern for code reuse -- [ ] Test data in `tests/data/fake_data` (real data in `tests/data/real_data`) -- [ ] Use `FileGuard` for temporary files (ensures cleanup even if test fails) - -**Note:** `tests/data` is a git submodule. Remember to commit changes separately. - -## Recent Bug Fixes - -| Issue | Bug | Fixed | -|-------|-----|-------| -| #278 | CSVFieldList move constructor dangling pointer | Feb 2026 | -| #280 | Field corruption at chunk boundaries | PR #282 | -| #281 | Stream-specific exception handling | PR #282 | -| #283 | Header detection with variable-width rows | Jan 2026 | -| #285 | Delimiter guessing overwrites `no_header()` | Feb 2026 | - -See inline comments in source files for implementation details. +- Always test both mmap and stream paths +- ≥500K rows needed to cross 10MB boundary +- Use distinct column values to detect field corruption +- Exceptions from worker thread need `exception_ptr` +- Changes to one constructor likely affect both paths + +## Tests +See `tests/AGENTS.md` for full test strategy, checklist, and conventions. diff --git a/README.md b/README.md index 10785ae..e9b3f5a 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,7 @@ It does not try to decode UTF-8, except for detecting and stripping UTF-8 byte o ### Well Tested This CSV parser has: * An extensive Catch2 test suite + * Tests of various CMake and non-CMake builds across g++, clang, MSVC, and MinGW * Address, thread safety, and undefined behavior checks with ASan, TSan, and Valgrind (see [GitHub Actions](https://github.com/vincentlaucsb/csv-parser/actions)) #### Bug Reports @@ -93,6 +94,10 @@ Found a bug? Please report it! 
This project welcomes **genuine bug reports broug * ✅ Performance regressions in real-world scenarios * ✅ API issues that affect **practical, real-world use cases** +When reporting integration or compiler issues, please state which library form you are using: + * Single-header + * Unamalgamated headers/library (`include/` with your own build system, CMake, etc.) + Please keep reports grounded in real use cases—no contrived edge cases or philosophical debates about API design, thanks! **Design Note:** `CSVReader` uses `std::input_iterator_tag` for single-pass streaming of arbitrarily large files. If you need multi-pass iteration or random access, copy rows to a `std::vector` first. This is by design, not a bug. @@ -117,6 +122,8 @@ All of the code required to build this library, aside from the C++ standard libr While C++17 is recommended, C++11 is the minimum version required. This library makes extensive use of string views, and uses [Martin Moene's string view library](https://github.com/martinmoene/string-view-lite) if `std::string_view` is not available. +This library requires C++ exceptions to be enabled (for example, do not compile with `-fno-exceptions`). + ### Single Header **[📥 Download csv.hpp](https://vincentlaucsb.github.io/csv-parser/csv.hpp)** — Available on GitHub Pages diff --git a/docs/source/Doxy.md b/docs/source/Doxy.md index f8c4220..3ab2b65 100644 --- a/docs/source/Doxy.md +++ b/docs/source/Doxy.md @@ -97,6 +97,9 @@ column extraction, editing, and grouping. * csv::CSVStat::get_col_names() ### CSV Writing +The [CSV Writing Guide](\ref md_docs_2source_2csv__writing) contains a +high-level overview of writing CSVs. 
+
 * csv::make_csv_writer(): Construct a csv::CSVWriter
 * csv::make_tsv_writer(): Construct a csv::TSVWriter
 * csv::DelimWriter
diff --git a/docs/source/csv_writing.md b/docs/source/csv_writing.md
new file mode 100644
index 0000000..a349f87
--- /dev/null
+++ b/docs/source/csv_writing.md
@@ -0,0 +1,41 @@
+# CSV Writing Guide
+
+This page summarizes write-side APIs and practical usage patterns for emitting
+CSV/TSV data.
+
+## Core Writer APIs
+
+* `csv::make_csv_writer()`
+* `csv::make_tsv_writer()`
+* `csv::DelimWriter`
+
+Use `csv::make_csv_writer()` for comma-delimited output and
+`csv::make_tsv_writer()` for tab-delimited output.
+
+## Writing Containers with `operator<<`
+
+Any row-like container of string-convertible values can be streamed directly.
+
+\snippet tests/test_write_csv.cpp CSV Writer Example
+
+## Writing Tuples and Custom Types
+
+`DelimWriter` can also serialize tuples and custom types that provide a string
+conversion.
+
+\snippet tests/test_write_csv.cpp CSV Writer Tuple Example
+
+## Data Reordering Workflow
+
+For read-transform-write pipelines, `csv::CSVRow` supports conversion to
+`std::vector<std::string>`, which makes it straightforward to reorder/select
+fields before writing.
+
+Typical flow:
+
+1. Read with `CSVReader`
+2. Convert row to `std::vector<std::string>`
+3. Reorder/select fields
+4. Emit with `CSVWriter`
+
+\snippet tests/test_write_csv.cpp CSV Reordering Example
diff --git a/include/csv.hpp b/include/csv.hpp
index 907c534..5b4bc83 100644
--- a/include/csv.hpp
+++ b/include/csv.hpp
@@ -1,5 +1,5 @@
 /*
-CSV for C++, version 2.5.1
+CSV for C++, version 2.5.2
 https://github.com/vincentlaucsb/csv-parser
 MIT License
diff --git a/include/external/mio.hpp b/include/external/mio.hpp
index 95c696c..a42a0cb 100644
--- a/include/external/mio.hpp
+++ b/include/external/mio.hpp
@@ -18,6 +18,15 @@
  * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
+ /* csv-parser local note:
+  *
+  * This vendored mio.hpp includes a minimal Windows-specific narrowing fix in
+  * int64_high/int64_low to avoid -Wconversion failures under strict MinGW builds.
+  * Keep this patch small and easy to rebase if/when upstream is updated.
+  *
+  * - Vincent La 3/31/2026
+  */
+
 #ifndef MIO_MMAP_HEADER
 #define MIO_MMAP_HEADER
@@ -785,13 +794,13 @@ namespace win {
 /** Returns the 4 upper bytes of an 8-byte integer. */
 inline DWORD int64_high(int64_t n) noexcept
 {
-    return n >> 32;
+    return static_cast<DWORD>(static_cast<std::uint64_t>(n) >> 32);
 }
 
 /** Returns the 4 lower bytes of an 8-byte integer. */
 inline DWORD int64_low(int64_t n) noexcept
 {
-    return n & 0xffffffff;
+    return static_cast<DWORD>(static_cast<std::uint64_t>(n) & 0xffffffffULL);
 }
 
 template<
diff --git a/include/internal/common.hpp b/include/internal/common.hpp
index 955f89c..f7668d8 100644
--- a/include/internal/common.hpp
+++ b/include/internal/common.hpp
@@ -54,16 +54,34 @@
 #define CSV_UNREACHABLE() abort()
 #endif
 
+// This library uses C++ exceptions for error reporting in public APIs.
+#if defined(__cpp_exceptions) || defined(_CPPUNWIND) || defined(__EXCEPTIONS)
+    #define CSV_EXCEPTIONS_ENABLED 1
+#else
+    #define CSV_EXCEPTIONS_ENABLED 0
+#endif
+
+#if !CSV_EXCEPTIONS_ENABLED
+    #error "csv-parser requires C++ exceptions. Enable exception handling (for example, remove -fno-exceptions or use /EHsc)."
+#endif
+
 // Detect C++ standard version BEFORE namespace to properly include string_view
-#if (defined(CMAKE_CXX_STANDARD) && CMAKE_CXX_STANDARD == 20) || __cplusplus >= 202002L
+// MSVC: __cplusplus == 199711L unless /Zc:__cplusplus is set; use _MSVC_LANG instead.
+#if defined(_MSVC_LANG) && _MSVC_LANG > __cplusplus
+# define CSV_CPLUSPLUS _MSVC_LANG
+#else
+# define CSV_CPLUSPLUS __cplusplus
+#endif
+
+#if (defined(CMAKE_CXX_STANDARD) && CMAKE_CXX_STANDARD == 20) || CSV_CPLUSPLUS >= 202002L
 #define CSV_HAS_CXX20
 #endif
 
-#if (defined(CMAKE_CXX_STANDARD) && CMAKE_CXX_STANDARD == 17) || __cplusplus >= 201703L
+#if (defined(CMAKE_CXX_STANDARD) && CMAKE_CXX_STANDARD == 17) || CSV_CPLUSPLUS >= 201703L
 #define CSV_HAS_CXX17
 #endif
 
-#if (defined(CMAKE_CXX_STANDARD) && CMAKE_CXX_STANDARD >= 14) || __cplusplus >= 201402L
+#if (defined(CMAKE_CXX_STANDARD) && CMAKE_CXX_STANDARD >= 14) || CSV_CPLUSPLUS >= 201402L
 #define CSV_HAS_CXX14
 #endif
diff --git a/include/internal/csv_row.hpp b/include/internal/csv_row.hpp
index cb14a04..349b001 100644
--- a/include/internal/csv_row.hpp
+++ b/include/internal/csv_row.hpp
@@ -322,9 +322,13 @@ namespace csv {
             const std::vector<std::string>& subset
         ) const;
 
-        /** Convert this CSVRow into a vector of strings.
-         *  **Note**: This is a less efficient method of
-         *  accessing data than using the [] operator.
+        /** Convert this row into a `std::vector<std::string>`.
+         *
+         *  This conversion is primarily intended for write-side workflows, such as
+         *  reordering or selecting columns before forwarding the row to `CSVWriter`.
+         *
+         *  @note This is less efficient than indexed access via `operator[]` because
+         *  it materializes all fields as owning strings.
          */
         operator std::vector<std::string>() const;
         ///@}
@@ -517,6 +521,11 @@ namespace csv {
     }
 }
 
+/** Stream insertion helper for `CSVField`.
+ *
+ *  Writes the textual field value to an output stream. This is mainly a convenience
+ *  for logging/debug output and simple formatting pipelines.
+ */ inline std::ostream& operator << (std::ostream& os, csv::CSVField const& value) { os << std::string(value); return os; diff --git a/tests/AGENTS.md b/tests/AGENTS.md new file mode 100644 index 0000000..2948124 --- /dev/null +++ b/tests/AGENTS.md @@ -0,0 +1,206 @@ +# CSV Parser - Test Agent Context + +> **Maintenance rule:** Whenever this file is changed, `CLAUDE.md` in the same directory must be updated to reflect the changes. `CLAUDE.md` is a bullet-point summary of this file and must stay in sync. + +## Test Checklist + +- [ ] Tests both mmap and stream paths (use `SECTION`) +- [ ] Distinct values per column +- [ ] ≥500K rows to cross chunk boundary +- [ ] Documents bug it would catch +- [ ] Lambda + SECTION pattern for code reuse +- [ ] Test data in `tests/data/fake_data` (real data in `tests/data/real_data`) +- [ ] Use `FileGuard` for temporary files (ensures cleanup even if test fails) + +**Note:** `tests/data` is a git submodule. Remember to commit changes separately. + +## Test Strategy: Use Distinct Column Values + +❌ **BAD:** `array{i, i, i, i, i}` - All columns identical +✅ **GOOD:** `array{i*5+0, i*5+1, i*5+2, i*5+3, i*5+4}` - Each column distinct + +**Why:** Field corruption is only detectable if columns have different values. + +## Test Architecture + +### Framework +- **Catch2 v3.6.0**: Modern C++ testing framework with SECTION support for testing multiple code paths + +### Shared Test Utilities (`tests/shared/`) + +**Always check `tests/shared/` before implementing any test helper from scratch.** + +| File | Purpose | +|------|---------| +| `shared/file_guard.hpp` | RAII temp-file cleanup — **use this for every temp file** | +| `shared/float_test_cases.hpp` | Shared floating-point edge-case data | +| `shared/timeout_helper.hpp` | Timeout wrapper for race/stress tests to prevent hangs | + +#### FileGuard — RAII temp file cleanup + +> **AI helpers**: the class is called `FileGuard`, not `TempFile`, `ScopedFile`, or `TempCSVFile`. 
+> Include it as `#include "shared/file_guard.hpp"`. Never use raw `std::remove()`. + +```cpp +#include "shared/file_guard.hpp" + +TEST_CASE("My test") { + FileGuard cleanup("./tests/data/tmp_foo.csv"); // deleted on scope exit + { + std::ofstream out(cleanup.filename, std::ios::binary); + out << "A,B\n1,2\n"; + } + CSVReader reader(cleanup.filename); // mmap path + REQUIRE(...); +} // std::remove() called here even if REQUIRE throws +``` + +--- + +### Test Organization + +### Testing Conventions + +#### Tests Should Expose Bugs, Not Assert Them + +When writing a test for a known bug, assert correct behavior (even if it currently fails), not buggy behavior. + +Wrong pattern (do not use): + +```cpp +TEST_CASE("Issue #123", "[bug]") { + REQUIRE(result == "wrong_value"); +} +``` + +Right pattern: + +```cpp +TEST_CASE("Issue #123", "[bug][!shouldfail]") { + REQUIRE(result == "correct_value"); +} +``` + +Why: +- Bug is visible immediately as a failing test +- Test auto-passes once bug is fixed +- No TODO/update cycle required + +#### Catch2 Tags for Known Failing Tests + +- Expected failing bug test: `[bug][!shouldfail]` +- Or skip by default: `[.][bug]` + +#### Placement Rule: Edge Cases and Regressions at End of File + +In each test file: +- Mainline/general feature tests first +- Edge-case and regression tests last + +This keeps the top of files focused on broad feature coverage and groups known edge cases in one place. + +#### Pattern for Known-Bug Regression Tests + +```cpp +TEST_CASE("Feature XYZ - Issue #N", "[issue_N][!shouldfail]") { + // Expected: X + // Actual (buggy): Y + auto result = buggy_function(); + REQUIRE(result == correct_value); +} +``` + +#### Temporary File Cleanup Must Use RAII + +Never use manual `std::remove()` cleanup in tests. + +Always use `FileGuard` from `shared/file_guard.hpp` so files are cleaned up even if assertions fail. 
+ +#### Path Testing Pattern +Most tests validate both code paths using a shared validation lambda: +```cpp +auto validate_reader = [&](CSVReader& reader) { + // Common validation logic for both paths +}; + +SECTION("Memory-mapped file path") { + CSVReader reader(filename); + validate_reader(reader); +} + +SECTION("std::istream path") { + std::ifstream infile(filename); + CSVReader reader(infile, CSVFormat()); + validate_reader(reader); +} +``` + +This pattern catches path-specific bugs like issue #281 (stream-only parsing error). + +#### File Cleanup +Tests use RAII cleanup via [FileGuard](shared/file_guard.hpp) — see the **Shared Test Utilities** section above for full usage. + +#### Timeout Guard for Race/Stress Tests + +Use [test_with_timeout](shared/timeout_helper.hpp) for tests that may hang under deadlock regressions. + +```cpp +#include "shared/timeout_helper.hpp" + +SECTION("Race-sensitive scenario") { + test_with_timeout([]() { + // loop / iterator logic that should complete quickly + }); +} +``` + +This gives explicit failures instead of CI hangs when synchronization regresses. + +### Test Files + +> **Rule**: Every `test_*.cpp` file in `tests/` **must** appear in `target_sources()` in `tests/CMakeLists.txt`. +> Files not listed there are silently never compiled or run. +> When adding a new test file, add it to CMakeLists.txt in the same commit. +> When asked to audit this, compare `ls tests/test_*.cpp` against the `target_sources()` list. 
+ +- **test_error_handling.cpp**: Exception propagation from PR #282 + - Validates worker thread exceptions reach main thread + - Tests chunk boundary corruption detection + +- **test_round_trip.cpp**: Write/read integrity across 10MB boundaries + - Basic functionality → distinct values → quoted edge cases + - Tests both mmap and stream parsing paths + +- **test_csv_format.cpp**: CSV format detection and configuration + - Issue #285: no_header() preservation with delimiter guessing + +- **test_guess_csv.cpp**: Delimiter and header detection heuristics + - Issue #283: header detection with wide headers + +- **test_read_csv_file.cpp**: File reading and column access + - get_col_pos(), prevent column name overwriting + +- **test_write_csv.cpp**: CSV writing and numeric conversion + - Buffered vs non-buffered writing modes + +- **test_csv_row.cpp**, **test_csv_field.cpp**: Individual component tests + +- **test_csv_iterator.cpp**: Iterator functionality and edge cases + +- **test_csv_ranges.cpp**: Range-based for loop support + +- **test_csv_row_json.cpp**: JSON export functionality + +### Key Patterns + +1. **Validation Lambdas**: Write once, test both paths +2. **SECTION Grouping**: Organize related scenarios +3. **FileGuard RAII**: Guaranteed cleanup for temp files +4. **Timeout Guards**: Use `test_with_timeout()` for race/deadlock-sensitive tests +5. **Distinct Values**: Detect cross-field corruption +6. 
**Chunk Boundary Testing**: Cross 10MB ITERATION_CHUNK_SIZE + +### Data Files +Test data in `tests/data/` is a git submodule: +- `fake_data/`: Small synthetic CSV files for specific test scenarios +- `real_data/`: Larger datasets for performance/stress testing diff --git a/tests/CLAUDE.md b/tests/CLAUDE.md index aa683b9..293fc21 100644 --- a/tests/CLAUDE.md +++ b/tests/CLAUDE.md @@ -1,183 +1,26 @@ -## Test Architecture - -### Framework -- **Catch2 v3.6.0**: Modern C++ testing framework with SECTION support for testing multiple code paths - -### Shared Test Utilities (`tests/shared/`) - -**Always check `tests/shared/` before implementing any test helper from scratch.** - -| File | Purpose | -|------|---------| -| `shared/file_guard.hpp` | RAII temp-file cleanup — **use this for every temp file** | -| `shared/float_test_cases.hpp` | Shared floating-point edge-case data | -| `shared/timeout_helper.hpp` | Timeout wrapper for race/stress tests to prevent hangs | - -#### FileGuard — RAII temp file cleanup - -> **AI helpers**: the class is called `FileGuard`, not `TempFile`, `ScopedFile`, or `TempCSVFile`. -> Include it as `#include "shared/file_guard.hpp"`. Never use raw `std::remove()`. - -```cpp -#include "shared/file_guard.hpp" - -TEST_CASE("My test") { - FileGuard cleanup("./tests/data/tmp_foo.csv"); // deleted on scope exit - { - std::ofstream out(cleanup.filename, std::ios::binary); - out << "A,B\n1,2\n"; - } - CSVReader reader(cleanup.filename); // mmap path - REQUIRE(...); -} // std::remove() called here even if REQUIRE throws -``` - ---- - -### Test Organization - -### Testing Conventions - -#### Tests Should Expose Bugs, Not Assert Them - -When writing a test for a known bug, assert correct behavior (even if it currently fails), not buggy behavior. 
- -Wrong pattern (do not use): - -```cpp -TEST_CASE("Issue #123", "[bug]") { - REQUIRE(result == "wrong_value"); -} -``` - -Right pattern: - -```cpp -TEST_CASE("Issue #123", "[bug][!shouldfail]") { - REQUIRE(result == "correct_value"); -} -``` - -Why: -- Bug is visible immediately as a failing test -- Test auto-passes once bug is fixed -- No TODO/update cycle required - -#### Catch2 Tags for Known Failing Tests - -- Expected failing bug test: `[bug][!shouldfail]` -- Or skip by default: `[.][bug]` - -#### Placement Rule: Edge Cases and Regressions at End of File - -In each test file: -- Mainline/general feature tests first -- Edge-case and regression tests last - -This keeps the top of files focused on broad feature coverage and groups known edge cases in one place. - -#### Pattern for Known-Bug Regression Tests - -```cpp -TEST_CASE("Feature XYZ - Issue #N", "[issue_N][!shouldfail]") { - // Expected: X - // Actual (buggy): Y - auto result = buggy_function(); - REQUIRE(result == correct_value); -} -``` - -#### Temporary File Cleanup Must Use RAII - -Never use manual `std::remove()` cleanup in tests. - -Always use `FileGuard` from `shared/file_guard.hpp` so files are cleaned up even if assertions fail. - -#### Path Testing Pattern -Most tests validate both code paths using a shared validation lambda: -```cpp -auto validate_reader = [&](CSVReader& reader) { - // Common validation logic for both paths -}; - -SECTION("Memory-mapped file path") { - CSVReader reader(filename); - validate_reader(reader); -} - -SECTION("std::istream path") { - std::ifstream infile(filename); - CSVReader reader(infile, CSVFormat()); - validate_reader(reader); -} -``` - -This pattern catches path-specific bugs like issue #281 (stream-only parsing error). - -#### File Cleanup -Tests use RAII cleanup via [FileGuard](shared/file_guard.hpp) — see the **Shared Test Utilities** section above for full usage. 
- -#### Timeout Guard for Race/Stress Tests - -Use [test_with_timeout](shared/timeout_helper.hpp) for tests that may hang under deadlock regressions. - -```cpp -#include "shared/timeout_helper.hpp" - -SECTION("Race-sensitive scenario") { - test_with_timeout([]() { - // loop / iterator logic that should complete quickly - }); -} -``` - -This gives explicit failures instead of CI hangs when synchronization regresses. - -### Test Files - -> **Rule**: Every `test_*.cpp` file in `tests/` **must** appear in `target_sources()` in `tests/CMakeLists.txt`. -> Files not listed there are silently never compiled or run. -> When adding a new test file, add it to CMakeLists.txt in the same commit. -> When asked to audit this, compare `ls tests/test_*.cpp` against the `target_sources()` list. - -- **test_error_handling.cpp**: Exception propagation from PR #282 - - Validates worker thread exceptions reach main thread - - Tests chunk boundary corruption detection - -- **test_round_trip.cpp**: Write/read integrity across 10MB boundaries - - Basic functionality → distinct values → quoted edge cases - - Tests both mmap and stream parsing paths - -- **test_csv_format.cpp**: CSV format detection and configuration - - Issue #285: no_header() preservation with delimiter guessing - -- **test_guess_csv.cpp**: Delimiter and header detection heuristics - - Issue #283: header detection with wide headers - -- **test_read_csv_file.cpp**: File reading and column access - - get_col_pos(), prevent column name overwriting - -- **test_write_csv.cpp**: CSV writing and numeric conversion - - Buffered vs non-buffered writing modes - -- **test_csv_row.cpp**, **test_csv_field.cpp**: Individual component tests - -- **test_csv_iterator.cpp**: Iterator functionality and edge cases - -- **test_csv_ranges.cpp**: Range-based for loop support - -- **test_csv_row_json.cpp**: JSON export functionality - -### Key Patterns - -1. **Validation Lambdas**: Write once, test both paths -2. 
**SECTION Grouping**: Organize related scenarios -3. **FileGuard RAII**: Guaranteed cleanup for temp files -4. **Timeout Guards**: Use `test_with_timeout()` for race/deadlock-sensitive tests -5. **Distinct Values**: Detect cross-field corruption -6. **Chunk Boundary Testing**: Cross 10MB ITERATION_CHUNK_SIZE - -### Data Files -Test data in `tests/data/` is a git submodule: -- `fake_data/`: Small synthetic CSV files for specific test scenarios -- `real_data/`: Larger datasets for performance/stress testing +# CSV Parser Tests - Claude Summary + +> **`AGENTS.md` is the source of truth.** This file is a bullet-point summary only. Always load and follow `tests/AGENTS.md` — it takes precedence over anything here. + +## Test Checklist +- [ ] Both mmap and stream paths tested (Catch2 `SECTION`) +- [ ] ≥500K rows to cross 10MB chunk boundary +- [ ] Distinct values per column (not `i, i, i, i, i`) +- [ ] `FileGuard` used for all temp files — never raw `std::remove()` +- [ ] New `test_*.cpp` files added to `target_sources()` in `tests/CMakeLists.txt` +- [ ] Test data in `tests/data/fake_data`; `tests/data` is a git submodule + +## Key Conventions +- Lambda + `SECTION` pattern: write validation logic once, run on both paths +- Known-bug tests: assert correct behavior with `[bug][!shouldfail]`, not buggy behavior +- Edge-case and regression tests go at the **end** of each file +- Use `test_with_timeout()` from `shared/timeout_helper.hpp` for race/hang-sensitive tests + +## Shared Utilities (`tests/shared/`) +- `file_guard.hpp` — RAII temp file cleanup (`FileGuard`, not `TempFile` or `ScopedFile`) +- `float_test_cases.hpp` — shared floating-point edge-case data +- `timeout_helper.hpp` — `test_with_timeout()` for deadlock-sensitive tests + +## Distinct Column Values +- Bad: `array{i, i, i, i, i}` — corruption undetectable +- Good: `array{i*5+0, i*5+1, i*5+2, i*5+3, i*5+4}` — each column unique diff --git a/tests/shared/timeout_helper.hpp b/tests/shared/timeout_helper.hpp index 
d8de29e..be9e192 100644
--- a/tests/shared/timeout_helper.hpp
+++ b/tests/shared/timeout_helper.hpp
@@ -8,8 +8,9 @@
 #pragma once
 #include
+#include
+#include
 #include
-#include
 #include
 /** Execute a test function with a timeout
@@ -30,17 +31,37 @@
  * @param fn Test function to execute
  * @param timeout Maximum time to wait before failing (default: 10 seconds)
  *
- * @throws std::runtime_error if timeout occurs
- * @rethrows any exception thrown by fn
+ * @note On timeout, this helper fails the test via REQUIRE and does not join
+ *       the worker thread. This avoids deadlocking the test thread while
+ *       reporting a deterministic failure.
+ * @rethrows any exception thrown by fn (re-raised on the caller thread)
  */
 template <typename Func, typename Duration = std::chrono::seconds>
 void test_with_timeout(Func fn, Duration timeout = std::chrono::seconds(10)) {
-    auto future = std::async(std::launch::async, fn);
-
+    auto completion = std::make_shared<std::promise<void>>();
+    auto future = completion->get_future();
+    auto worker_exception = std::make_shared<std::exception_ptr>();
+
+    std::thread([fn = std::move(fn), completion, worker_exception]() mutable {
+        try {
+            fn();
+        }
+        catch (...) {
+            *worker_exception = std::current_exception();
+        }
+
+        try {
+            completion->set_value();
+        }
+        catch (...) {
+            // Promise may be abandoned on test shutdown paths. 
+        }
+    }).detach();
+
     auto status = future.wait_for(timeout);
-    REQUIRE(status == std::future_status::ready);
-
-    // Re-throw any exception from the test function
-    future.get();
+    REQUIRE(status == std::future_status::ready);
+    if (*worker_exception) {
+        std::rethrow_exception(*worker_exception);
+    }
 }
diff --git a/tests/test_csv_field.cpp b/tests/test_csv_field.cpp
index e22b542..3242d8f 100644
--- a/tests/test_csv_field.cpp
+++ b/tests/test_csv_field.cpp
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include <sstream>
 using namespace csv;
@@ -170,6 +171,24 @@ TEST_CASE("CSVField get<>() - Floating Point Value", "[test_csv_field_get_float]
 }
 }
+TEST_CASE("CSVField try_get()", "[test_csv_field_try_get_long_double]") {
+    SECTION("Numeric value") {
+        CSVField field("2.718");
+        long double out = 0;
+
+        REQUIRE(field.try_get(out));
+        REQUIRE(internals::is_equal(out, 2.718L));
+    }
+
+    SECTION("Non-numeric value") {
+        CSVField field("not-a-number");
+        long double out = 123.0L;
+
+        REQUIRE_FALSE(field.try_get(out));
+        REQUIRE(internals::is_equal(out, 123.0L));
+    }
+}
+
 TEST_CASE("CSVField try_parse_hex()", "[test_csv_field_parse_hex]") {
     long long value = 0;
@@ -291,3 +310,12 @@ TEST_CASE("CSVField Equality Operator", "[test_csv_field_operator==]") {
     REQUIRE(field == 3.14f);
     REQUIRE(field == 3.14);
 }
+
+TEST_CASE("CSVField stream insertion operator", "[test_csv_field_stream_operator]") {
+    CSVField field("hello");
+    std::stringstream out;
+
+    out << field;
+
+    REQUIRE(out.str() == " hello");
+}
diff --git a/tests/test_data_frame.cpp b/tests/test_data_frame.cpp
index f113262..4157eac 100644
--- a/tests/test_data_frame.cpp
+++ b/tests/test_data_frame.cpp
@@ -123,6 +123,18 @@ TEST_CASE("DataFrame: keyed helpers", "[data_frame]") {
     }
     }
     REQUIRE(found_carly);
+
+    // Verify edits are visible through keyed const iteration (covers const_iterator edit overlay path)
+    const auto& cframe = frame;
+    bool found_carly_const = false;
+    bool found_bob_const = false;
+    for (auto cit = cframe.cbegin(); cit != cframe.cend(); ++cit) {
+        std::string 
name = (*cit)["name"].get(); + if (name == "Carly") found_carly_const = true; + if (name == "Bob") found_bob_const = true; + } + REQUIRE(found_carly_const); + REQUIRE(found_bob_const); // Verify DataFrameRow stores key and can be converted to vector auto row_0 = frame.at(0); diff --git a/tests/test_guess_csv.cpp b/tests/test_guess_csv.cpp index d8c8394..187c286 100644 --- a/tests/test_guess_csv.cpp +++ b/tests/test_guess_csv.cpp @@ -74,4 +74,32 @@ TEST_CASE("guess_delim() Test - Comments Before Header", "[test_guess_comments_b REQUIRE(col_names[0] == "a"); REQUIRE(col_names[1] == "b"); REQUIRE(col_names[2] == "c"); +} + +TEST_CASE("get_col_names(filename, format)", "[test_get_col_names_filename_format]") { + const std::string path = "./tests/data/fake_data/comments_before_header.csv"; + + SECTION("Guessed delimiter and header row") { + CSVFormat format; + format.delimiter({ ',', ';' }); + + auto col_names = get_col_names(path, format); + + REQUIRE(col_names.size() == 3); + REQUIRE(col_names[0] == "a"); + REQUIRE(col_names[1] == "b"); + REQUIRE(col_names[2] == "c"); + } + + SECTION("Explicit delimiter and header row") { + CSVFormat format; + format.delimiter(';').header_row(2); + + auto col_names = get_col_names(path, format); + + REQUIRE(col_names.size() == 3); + REQUIRE(col_names[0] == "a"); + REQUIRE(col_names[1] == "b"); + REQUIRE(col_names[2] == "c"); + } } \ No newline at end of file diff --git a/tests/test_write_csv.cpp b/tests/test_write_csv.cpp index 96c2edd..de5d2bb 100644 --- a/tests/test_write_csv.cpp +++ b/tests/test_write_csv.cpp @@ -161,6 +161,31 @@ TEMPLATE_TEST_CASE("CSV/TSV Writer - operator <<", "[test_csv_operator<<]", } //! [CSV Writer Example] +//! 
[CSV Reordering Example]
+TEST_CASE("CSV Writer - Reorder Columns", "[test_csv_reorder]") {
+    auto rows = "A,B,C\r\n"
+        "1,2,3\r\n"
+        "4,5,6"_csv;
+
+    std::stringstream output, correct;
+    auto writer = make_csv_writer(output);
+
+    writer << std::vector<std::string>({ "C", "A" });
+    for (auto& row : rows) {
+        writer << std::vector<std::string>({
+            row["C"].get<>(),
+            row["A"].get<>()
+        });
+    }
+
+    correct << "C,A" << std::endl
+        << "3,1" << std::endl
+        << "6,4" << std::endl;
+
+    REQUIRE(output.str() == correct.str());
+}
+//! [CSV Reordering Example]
+
 //! [CSV Writer Tuple Example]
 struct Time {
     std::string hour;