From 19fa4cc00cc748123c267ad4de19dab0adeac9e7 Mon Sep 17 00:00:00 2001 From: Matthew Hambrecht Date: Mon, 18 May 2026 22:20:04 -0400 Subject: [PATCH] P1 --- .github/workflows/integration.yml | 15 +- .github/workflows/quality.yml | 20 +- .github/workflows/unit.yml | 8 +- .gitignore | 1 + Cargo.lock | 2044 +++++++++++++++++++-- Cargo.toml | 64 +- README.md | 42 +- docs/CI.md | 25 - examples/common/mod.rs | 1 + examples/common/sparse_fill_visualizer.rs | 6 +- examples/common/sparse_materialization.rs | 20 +- examples/common/utils.rs | 33 + examples/file_to_file.md | 35 +- src/common/chunks.rs | 51 + src/common/codec.rs | 24 + src/common/mod.rs | 2 + src/coverage.rs | 21 + src/metadata/mod.rs | 54 + src/metadata/spec.rs | 94 + src/shared.rs | 1 + src/sources/file.rs | 547 +----- src/sources/http.rs | 298 --- src/sources/mod.rs | 8 +- src/sources/opendal.rs | 87 + src/utils/flaky.rs | 35 +- src/utils/mod.rs | 2 +- src/utils/oracle.rs | 123 +- src/writer.rs | 47 +- tests/core.rs | 271 ++- tests/file.rs | 217 ++- tests/harness.rs | 22 +- tests/http.rs | 207 --- tests/opendal.rs | 85 + 33 files changed, 2887 insertions(+), 1623 deletions(-) create mode 100644 examples/common/utils.rs create mode 100644 src/common/chunks.rs create mode 100644 src/common/codec.rs create mode 100644 src/common/mod.rs create mode 100644 src/metadata/mod.rs create mode 100644 src/metadata/spec.rs delete mode 100644 src/sources/http.rs create mode 100644 src/sources/opendal.rs delete mode 100644 tests/http.rs create mode 100644 tests/opendal.rs diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 7bbfbdd..8501966 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -28,11 +28,14 @@ jobs: with: shared-key: cargo-${{ inputs.runner }} - - name: File feature integration tests - run: cargo nextest run --features utils,file + - name: Core integration tests + run: cargo nextest run --lib core --features test-utils - - name: HTTP 
feature integration tests - run: cargo nextest run --features utils,http + - name: Harness implementation integration tests + run: cargo nextest run --lib harness --features debug - - name: Full feature matrix - run: cargo nextest run --all-features + - name: File implementation integration tests + run: cargo nextest run --lib file --features test-utils,impl-file + + - name: OpenDAL implementation integration tests + run: cargo nextest run --lib opendal --features test-utils,impl-opendal \ No newline at end of file diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 3078512..ea8fec4 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -22,6 +22,9 @@ jobs: with: components: clippy + - name: Install cargo-hack + run: cargo install cargo-hack + - uses: Swatinem/rust-cache@v2 with: shared-key: cargo-${{ inputs.runner }} @@ -31,24 +34,15 @@ jobs: - name: Rustfmt run: | - cargo +nightly fmt --all - if ! git diff --quiet; then - git status --short - git diff --stat - exit 1 - fi - + cargo +nightly fmt --all -- --check + - name: Clippy run: cargo clippy --all-features --all-targets -- -D warnings - name: Feature compile checks run: | - cargo check - cargo check --tests - cargo check --features file - cargo check --features http - cargo check --features debug - cargo check --all-features + cargo hack check --each-feature + cargo hack check --no-default-features - name: Docs.rs feature set run: RUSTDOCFLAGS='--cfg docsrs' cargo doc --all-features --no-deps diff --git a/.github/workflows/unit.yml b/.github/workflows/unit.yml index ede2270..aa5c51d 100644 --- a/.github/workflows/unit.yml +++ b/.github/workflows/unit.yml @@ -29,10 +29,4 @@ jobs: shared-key: cargo-${{ inputs.runner }} - name: Unit tests - run: cargo nextest run --features utils - - - name: Debug harness tests - run: cargo nextest run --features utils,debug - - - name: Doc tests - run: cargo test --doc --all-features + run: cargo test --lib --all-features 
diff --git a/.gitignore b/.gitignore index 7d249f7..b6d5a2b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ target # Debug artifacts *.vgcore heaptrack.* +.cruft/ # Python artifacts .venv/ diff --git a/Cargo.lock b/Cargo.lock index 5c8f2e5..c6eefb1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anstream" version = "1.0.0" @@ -62,15 +71,32 @@ dependencies = [ ] [[package]] -name = "assert-json-diff" -version = "2.0.2" +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "approx" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" dependencies = [ - "serde", - "serde_json", + "num-traits", ] +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "async-stream" version = "0.3.6" @@ -93,30 +119,129 @@ dependencies = [ "syn", ] +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", 
+ "syn", +] + [[package]] name = "atomic-waker" version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "aws-lc-rs" +version = "1.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ec6fb3fe69024a75fa7e1bfb48aa6cf59706a101658ea01bfd33b2b248a038f" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.40.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f50037ee5e1e41e7b8f9d161680a725bd1626cb6f8c7e901f91f942850852fe7" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + [[package]] name = "base64" version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + [[package]] name = "bitflags" version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" +[[package]] +name = "blake3" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d" 
+dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if 1.0.4", + "constant_time_eq", + "cpufeatures 0.2.17", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bumpalo" version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + [[package]] name = "bytes" version = "1.11.1" @@ -130,9 +255,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + [[package]] name = "cfg-if" version = "1.0.4" @@ -145,6 +278,30 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if 1.0.4", + "cpufeatures 0.3.0", + "rand_core 0.10.1", 
+] + +[[package]] +name = "chrono" +version = "0.4.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-link", +] + [[package]] name = "clap" version = "4.6.0" @@ -185,6 +342,15 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" +[[package]] +name = "cmake" +version = "0.1.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.5" @@ -200,6 +366,209 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "const-str" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18f12cc9948ed9604230cdddc7c86e270f9401ccbe3c2e98a4378c5e7632212f" + +[[package]] +name = "const_panic" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e262cdaac42494e3ae34c43969f9cdeb7da178bdb4b66fa6a1ea2edb4c8ae652" +dependencies = [ + "typewit", +] + +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + +[[package]] +name = "core-foundation" 
+version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "countio" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9702aee5d1d744c01d82f6915644f950f898e014903385464c773b96fefdecb" +dependencies = [ + "futures-io", +] + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if 1.0.4", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "ctor" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e" +dependencies = [ + "ctor-proc-macro", + "dtor", +] + +[[package]] +name = "ctor-proc-macro" +version = "0.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1" + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "dirs" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.61.2", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -211,6 +580,33 @@ dependencies = [ "syn", ] +[[package]] +name = "dtor" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301" +dependencies = [ + "dtor-proc-macro", +] + +[[package]] +name = "dtor-proc-macro" +version = "0.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5" + +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "equivalent" version = "1.0.2" @@ -240,10 +636,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] -name = "fnv" -version = "1.0.7" +name = "foldhash" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" [[package]] name = "form_urlencoded" @@ -254,6 +650,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "futures" version = "0.3.32" @@ -342,16 +744,35 @@ dependencies = [ "slab", ] +[[package]] +name = "gearhash" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8cf82cf76cd16485e56295a1377c775ce708c9f1a0be6b029076d60a245d213" +dependencies = [ + "cfg-if 0.1.10", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "js-sys", "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -361,31 +782,57 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "js-sys", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", "wasm-bindgen", ] [[package]] -name = "h2" -version = "0.4.13" +name = "getrandom" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ - "atomic-waker", - "bytes", - "fnv", - "futures-core", - "futures-sink", - "http", - "indexmap", - "slab", - "tokio", - "tokio-util", - "tracing", + "cfg-if 1.0.4", + "js-sys", + "libc", + "r-efi 6.0.0", + "rand_core 0.10.1", + "wasip2", + "wasip3", + "wasm-bindgen", +] + +[[package]] +name = "git-version" 
+version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad568aa3db0fcbc81f2f116137f263d7304f512a1209b35b85150d3ef88ad19" +dependencies = [ + "git-version-macro", +] + +[[package]] +name = "git-version-macro" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", ] [[package]] @@ -394,12 +841,55 @@ version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" +[[package]] +name = "heapify" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0049b265b7f201ca9ab25475b22b47fe444060126a51abe00f77d986fc5cc52e" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hf-xet" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "430b33fa84f92796d4d263070b6c0d3ca219df7b9a0e1853ee431029b1612bcd" +dependencies = [ + "async-trait", + "bytes", + "http", + "more-asserts", + "serde", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "uuid", + "xet-client", + "xet-core-structures", + "xet-data", + "xet-runtime", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = "http" version = "1.4.0" @@ -440,10 +930,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" [[package]] -name = "httpdate" -version = "1.0.3" +name = "humantime" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" [[package]] name = "hyper" @@ -455,11 +945,9 @@ dependencies = [ "bytes", "futures-channel", "futures-core", - "h2", "http", "http-body", "httparse", - "httpdate", "itoa", "pin-project-lite", "smallvec", @@ -481,7 +969,6 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", - "webpki-roots", ] [[package]] @@ -502,9 +989,35 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2", + "system-configuration", "tokio", "tower-service", "tracing", + "windows-registry", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", ] [[package]] @@ -589,6 +1102,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "idna" version = "1.1.0" @@ -617,7 +1136,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.17.0", + "serde", + "serde_core", ] [[package]] @@ -642,36 +1163,179 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +[[package]] +name = "jiff" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f00b5dbd620d61dfdcb6007c9c1f6054ebd75319f163d886a9055cec1155073d" +dependencies = [ + "jiff-static", + "jiff-tzdb-platform", + "js-sys", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", + "wasm-bindgen", + "windows-sys 0.61.2", +] + +[[package]] +name = "jiff-static" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e000de030ff8022ea1da3f466fbb0f3a809f5e51ed31f6dd931c35181ad8e6d7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c900ef84826f1338a557697dc8fc601df9ca9af4ac137c7fb61d4c6f2dfd3076" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + +[[package]] +name = "jni" +version = "0.22.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efd9a482cf3a427f00d6b35f14332adc7902ce91efb778580e180ff90fa3498" +dependencies = [ + "cfg-if 1.0.4", + "combine", + "jni-macros", + "jni-sys", + "log", + "simd_cesu8", + "thiserror", + "walkdir", + "windows-link", +] + +[[package]] +name = "jni-macros" +version = "0.22.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a00109accc170f0bdb141fed3e393c565b6f5e072365c3bd58f5b062591560a3" +dependencies = [ + "proc-macro2", + "quote", + "rustc_version", + "simd_cesu8", + "syn", +] + +[[package]] +name = "jni-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2" +dependencies = [ + "jni-sys-macros", +] + +[[package]] +name = "jni-sys-macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + [[package]] name = "js-sys" version = "0.3.94" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "futures-util", "once_cell", "wasm-bindgen", ] +[[package]] +name = "konst" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f660d5f887e3562f9ab6f4a14988795b694099d66b4f5dedc02d197ba9becb1d" +dependencies = [ + "const_panic", + 
"konst_proc_macros", + "typewit", +] + +[[package]] +name = "konst_proc_macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" + [[package]] name = "lazy_static" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "libc" version = "0.2.184" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" +[[package]] +name = "libredox" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c" +dependencies = [ + "libc", +] + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -684,15 +1348,6 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" -[[package]] -name = "lock_api" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" -dependencies = [ - "scopeguard", -] - [[package]] name = "log" version = "0.4.29" @@ -705,6 +1360,43 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "lz4_flex" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a" +dependencies = [ + "twox-hash", +] + 
+[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if 1.0.4", + "digest", +] + +[[package]] +name = "mea" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6747f54621d156e1b47eb6b25f39a941b9fc347f98f67d25d8881ff99e8ed832" +dependencies = [ + "slab", +] + [[package]] name = "memchr" version = "2.8.0" @@ -718,33 +1410,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" dependencies = [ "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "windows-sys 0.61.2", ] [[package]] -name = "mockito" -version = "1.7.2" +name = "more-asserts" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fafa6961cabd9c63bcd77a45d7e3b7f3b552b70417831fb0f56db717e72407e" + +[[package]] +name = "ntapi" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90820618712cab19cfc46b274c6c22546a82affcb3c3bdf0f29e3db8e1bb92c0" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" dependencies = [ - "assert-json-diff", - "bytes", - "colored", - "futures-core", - "http", - "http-body", - "http-body-util", - "hyper", - "hyper-util", - "log", - "pin-project-lite", - "rand 0.9.2", - "regex", - "serde_json", - "serde_urlencoded", - "similar", - "tokio", + "winapi", ] [[package]] @@ -757,84 +1439,247 @@ dependencies = [ ] [[package]] -name = "once_cell" -version = "1.21.4" +name = "num-conv" +version = "0.2.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" [[package]] -name = "once_cell_polyfill" -version = "1.70.2" +name = "num-traits" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] [[package]] -name = "parking_lot" -version = "0.12.5" +name = "objc2-core-foundation" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ - "lock_api", - "parking_lot_core", + "bitflags", ] [[package]] -name = "parking_lot_core" -version = "0.9.12" +name = "objc2-io-kit" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +checksum = "33fafba39597d6dc1fb709123dfa8289d39406734be322956a69f0931c73bb15" dependencies = [ - "cfg-if", "libc", - "redox_syscall", - "smallvec", - "windows-link", + "objc2-core-foundation", ] [[package]] -name = "percent-encoding" -version = "2.3.2" +name = "objc2-system-configuration" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +checksum = "7216bd11cbda54ccabcab84d523dc93b858ec75ecfb3a7d89513fa22464da396" +dependencies = [ + "objc2-core-foundation", +] [[package]] -name = "pin-project-lite" -version = "0.2.17" +name = "once_cell" +version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] -name = "potential_utf" -version = "0.1.5" +name = "once_cell_polyfill" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" -dependencies = [ - "zerovec", -] +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] -name = "ppv-lite86" -version = "0.2.21" +name = "oneshot" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] +checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107" [[package]] -name = "proc-macro2" -version = "1.0.106" +name = "opendal" +version = "0.56.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +checksum = "97b31d3d8e99a85d83b73ec26647f5607b80578ed9375810b6e44ffa3590a236" dependencies = [ - "unicode-ident", + "opendal-core", + "opendal-service-hf", ] [[package]] -name = "quinn" -version = "0.11.9" +name = "opendal-core" +version = "0.56.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +checksum = "1849dd2687e173e776d3af5fce1ba3ae47b9dd37a09d1c4deba850ef45fe00ca" +dependencies = [ + "anyhow", + "base64", + "bytes", + "futures", + "http", + "http-body", + "jiff", + "log", + "md-5", + "mea", + "percent-encoding", + "quick-xml", + "reqsign-core", + "reqwest", + "serde", + "serde_json", + "tokio", + "url", + "uuid", + "web-time", +] + +[[package]] +name = "opendal-service-hf" +version = "0.56.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7b2ab7a2a8a11dfe257ef4db5c0de798acbcd0d6429c37382dad2154bc06a388" +dependencies = [ + "bytes", + "hf-xet", + "http", + "log", + "opendal-core", + "percent-encoding", + "reqwest", + "serde", + "serde_json", +] + +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + +[[package]] +name = "os_str_bytes" +version = "6.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2355d85b9a3786f481747ced0e0ff2ba35213a1f9bd406ed906554d7af805a1" +dependencies = [ + "memchr", +] + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "pin-project" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] 
+name = "portable-atomic-util" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618" +dependencies = [ + "portable-atomic", +] + +[[package]] +name = "potential_utf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" +dependencies = [ + "zerovec", +] + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quick-xml" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" dependencies = [ "bytes", "cfg_aliases", @@ -856,6 +1701,7 @@ version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" dependencies = [ + "aws-lc-rs", "bytes", "getrandom 0.3.4", "lru-slab", @@ -900,6 +1746,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "rand" version = "0.8.5" @@ -921,6 +1773,17 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" +dependencies = [ + "chacha20", + "getrandom 0.4.2", + "rand_core 0.10.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -960,12 +1823,29 @@ dependencies = [ ] [[package]] -name = "redox_syscall" -version = "0.5.18" +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + +[[package]] +name = "redb" +version = "3.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +checksum = "ef99362319c782aa4639ad3a306b64c3bb90e12874e99b8df124cb679d988611" dependencies = [ - "bitflags", + "libc", +] + +[[package]] +name = "redox_users" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" +dependencies = [ + "getrandom 0.2.17", + "libredox", + "thiserror", ] [[package]] @@ -997,15 +1877,38 @@ version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "reqsign-core" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b10302cf0a7d7e7352ba211fc92c3c5bebf1286153e49cc5aa87348078a8e102" +dependencies = [ + "anyhow", + "base64", + "bytes", + "form_urlencoded", + "futures", + "hex", + "hmac", + "http", + "jiff", + "log", + "percent-encoding", + "sha1", + "sha2", + "windows-sys 0.61.2", +] + [[package]] name = "reqwest" -version = "0.12.28" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +checksum = "62e0021ea2c22aed41653bc7e1419abb2c97e038ff2c33d0e1309e49a97deec0" dependencies = [ "base64", "bytes", "futures-core", + "futures-util", "http", "http-body", "http-body-util", @@ -1019,20 +1922,35 @@ dependencies = [ "quinn", "rustls", "rustls-pki-types", + "rustls-platform-verifier", "serde", "serde_json", - "serde_urlencoded", "sync_wrapper", "tokio", "tokio-rustls", + "tokio-util", "tower", "tower-http", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", + "wasm-streams", "web-sys", - "webpki-roots", +] + +[[package]] +name = "reqwest-middleware" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "199dda04a536b532d0cc04d7979e39b1c763ea749bf91507017069c00b96056f" +dependencies = [ + "anyhow", + "async-trait", + "http", + "reqwest", + "thiserror", + "tower-service", ] [[package]] @@ -1042,7 +1960,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", - "cfg-if", + "cfg-if 1.0.4", "getrandom 0.2.17", "libc", "untrusted", @@ -1055,6 +1973,15 @@ version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" 
+[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.1.4" @@ -1074,14 +2001,26 @@ version = "0.23.37" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" dependencies = [ + "aws-lc-rs", "once_cell", - "ring", "rustls-pki-types", "rustls-webpki", "subtle", "zeroize", ] +[[package]] +name = "rustls-native-certs" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + [[package]] name = "rustls-pki-types" version = "1.14.0" @@ -1092,12 +2031,40 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-platform-verifier" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d1e2536ce4f35f4846aa13bff16bd0ff40157cdb14cc056c7b14ba41233ba0" +dependencies = [ + "core-foundation 0.10.1", + "core-foundation-sys", + "jni", + "log", + "once_cell", + "rustls", + "rustls-native-certs", + "rustls-platform-verifier-android", + "rustls-webpki", + "security-framework", + "security-framework-sys", + "webpki-root-certs", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls-platform-verifier-android" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" + [[package]] name = "rustls-webpki" version = "0.103.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20a6af516fea4b20eccceaf166e8aa666ac996208e8a644ce3ef5aa783bc7cd4" dependencies = [ + "aws-lc-rs", "ring", "rustls-pki-types", 
"untrusted", @@ -1116,10 +2083,57 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" [[package]] -name = "scopeguard" -version = "1.2.0" +name = "safe-transmute" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3944826ff8fa8093089aba3acb4ef44b9446a99a16f3bf4e74af3f77d340ab7d" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "security-framework" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" +dependencies = [ + "bitflags", + "core-foundation 0.10.1", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" [[package]] name = "serde" @@ -1165,41 +2179,93 @@ dependencies = [ ] [[package]] -name = "serde_urlencoded" -version = "0.7.1" +name = "serde_repr" +version = "0.1.20" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", + "proc-macro2", + "quote", + "syn", ] [[package]] -name = "sharded-slab" -version = "0.1.7" +name = "sha1" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ - "lazy_static", + "cfg-if 1.0.4", + "cpufeatures 0.2.17", + "digest", ] [[package]] -name = "shlex" -version = "1.3.0" +name = "sha2" +version = "0.10.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if 1.0.4", + "cpufeatures 0.2.17", + "digest", + "sha2-asm", +] [[package]] -name = "similar" -version = "2.7.0" +name = "sha2-asm" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" +checksum = "b845214d6175804686b2bd482bcffe96651bb2d1200742b712003504a2dac1ab" +dependencies = [ + "cc", +] [[package]] -name = "slab" -version = "0.4.12" +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shellexpand" +version = "3.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32824fab5e16e6c4d86dc1ba84489390419a39f97699852b66480bb87d297ed8" +dependencies = [ + "bstr", + "dirs", + "os_str_bytes", +] + 
+[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "simd_cesu8" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94f90157bb87cddf702797c5dadfa0be7d266cdf49e22da2fcaa32eff75b2c33" +dependencies = [ + "rustc_version", + "simdutf8", +] + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "slab" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" @@ -1224,13 +2290,17 @@ name = "sparseio" version = "0.0.1" dependencies = [ "async-stream", + "bincode", "bytes", "clap", "futures", + "hex", "libc", - "mockito", + "opendal", "rand 0.8.5", "reqwest", + "serde", + "sha2", "tempfile", "tokio", "tracing", @@ -1243,6 +2313,22 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "statrs" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a3fe7c28c6512e766b0874335db33c94ad7b8f9054228ae1c2abd47ce7d335e" +dependencies = [ + "approx", + "num-traits", +] + [[package]] name = "strsim" version = "0.11.1" @@ -1255,6 +2341,12 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "symlink" 
+version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a" + [[package]] name = "syn" version = "2.0.117" @@ -1286,6 +2378,41 @@ dependencies = [ "syn", ] +[[package]] +name = "sysinfo" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ab6a2f8bfe508deb3c6406578252e491d299cbbf3bc0529ecc3313aee4a52f" +dependencies = [ + "libc", + "memchr", + "ntapi", + "objc2-core-foundation", + "objc2-io-kit", + "windows", +] + +[[package]] +name = "system-configuration" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" +dependencies = [ + "bitflags", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tempfile" version = "3.27.0" @@ -1293,7 +2420,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom 0.4.2", "once_cell", "rustix", "windows-sys 0.61.2", @@ -1325,7 +2452,38 @@ version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", +] + +[[package]] +name = "time" +version = "0.3.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde_core", + 
"time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" + +[[package]] +name = "time-macros" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +dependencies = [ + "num-conv", + "time-core", ] [[package]] @@ -1362,7 +2520,6 @@ dependencies = [ "bytes", "libc", "mio", - "parking_lot", "pin-project-lite", "socket2", "tokio-macros", @@ -1380,6 +2537,17 @@ dependencies = [ "syn", ] +[[package]] +name = "tokio-retry" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40f644c762e9d396831ae2f8935c954b0d758c4532e924bead0f666d0c1c8640" +dependencies = [ + "pin-project-lite", + "rand 0.10.1", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.26.4" @@ -1455,9 +2623,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ "pin-project-lite", + "tracing-attributes", "tracing-core", ] +[[package]] +name = "tracing-appender" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "050686193eb999b4bb3bc2acfa891a13da00f79734704c4b8b4ef1a10b368a3c" +dependencies = [ + "crossbeam-channel", + "symlink", + "thiserror", + "time", + "tracing-subscriber", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tracing-core" version = "0.1.36" @@ -1479,18 +2672,35 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + [[package]] name = "tracing-subscriber" version = "0.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" dependencies = [ + "matchers", "nu-ansi-term", + "once_cell", + "regex-automata", + "serde", + "serde_json", "sharded-slab", "smallvec", "thread_local", + "tracing", "tracing-core", "tracing-log", + "tracing-serde", ] [[package]] @@ -1499,18 +2709,48 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + +[[package]] +name = "typenum" +version = "1.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" + +[[package]] +name = "typewit" +version = "1.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "214ca0b2191785cbc06209b9ca1861e048e39b5ba33574b3cedd58363d5bb5f6" + [[package]] name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "untrusted" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" 
+[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "url" version = "2.5.8" @@ -1523,6 +2763,12 @@ dependencies = [ "serde", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -1535,12 +2781,46 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "uuid" +version = "1.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" +dependencies = [ + "getrandom 0.4.2", + "js-sys", + "serde_core", + "wasm-bindgen", +] + [[package]] name = "valuable" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -1556,13 +2836,40 @@ version = "0.11.1+wasi-snapshot-preview1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" +[[package]] +name = "wasi" +version = "0.14.7+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c" +dependencies = [ + "wasip2", +] + [[package]] name = "wasip2" version = "1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.46.0", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", +] + +[[package]] +name = "wasite" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fe902b4a6b8028a753d5424909b764ccf79b7a209eac9bf97e59cda9f71a42" +dependencies = [ + "wasi 0.14.7+wasi-0.2.4", ] [[package]] @@ -1571,7 +2878,7 @@ version = "0.2.117" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0" dependencies = [ - "cfg-if", + "cfg-if 1.0.4", "once_cell", "rustversion", "wasm-bindgen-macro", @@ -1620,6 +2927,53 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", 
+ "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasm-streams" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d1ec4f6517c9e11ae630e200b2b65d193279042e28edd4a2cda233e46670bbb" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + [[package]] name = "web-sys" version = "0.3.94" @@ -1641,20 +2995,170 @@ dependencies = [ ] [[package]] -name = "webpki-roots" -version = "1.0.6" +name = "webpki-root-certs" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" +checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" dependencies = [ "rustls-pki-types", ] +[[package]] +name = "whoami" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6a5b12f9df4f978d2cfdb1bd3bac52433f44393342d7ee9c25f5a1c14c0f45d" +dependencies = [ + "libc", + "libredox", + "objc2-system-configuration", + "wasite", + "web-sys", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "527fadee13e0c05939a6a05d5bd6eec6cd2e3dbd648b9f8e447c6518133d8580" +dependencies = [ + "windows-collections", + "windows-core", + "windows-future", + "windows-numerics", +] + +[[package]] +name = "windows-collections" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b2d95af1a8a14a3c7367e1ed4fc9c20e0a26e79551b1454d72583c97cc6610" +dependencies = [ + "windows-core", +] + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-future" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d6f90251fe18a279739e78025bd6ddc52a7e22f921070ccdc67dde84c605cb" +dependencies = [ + "windows-core", + "windows-link", + "windows-threading", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-numerics" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e2e40844ac143cdb44aead537bbf727de9b044e107a0f1220392177d15b0f26" +dependencies = [ + "windows-core", + "windows-link", +] + +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -1715,6 +3219,15 @@ dependencies = [ "windows_x86_64_msvc 0.53.1", ] +[[package]] +name = "windows-threading" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3949bd5b99cafdf1c7ca86b43ca564028dfe27d66958f2470940f73d86d75b37" +dependencies = [ + "windows-link", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -1817,12 +3330,247 @@ version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +[[package]] +name = 
"wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + [[package]] name = "writeable" version = "0.6.3" source 
= "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +[[package]] +name = "xet-client" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e1e496dcbe6a09017acdfaf48e1a646735e7ff5b2a49e2c7e081cca77a59bc8" +dependencies = [ + "anyhow", + "async-trait", + "base64", + "bytes", + "clap", + "crc32fast", + "futures", + "http", + "hyper", + "lazy_static", + "more-asserts", + "rand 0.10.1", + "redb", + "reqwest", + "reqwest-middleware", + "serde", + "serde_json", + "serde_repr", + "statrs", + "tempfile", + "thiserror", + "tokio", + "tokio-retry", + "tracing", + "tracing-subscriber", + "url", + "urlencoding", + "web-time", + "xet-core-structures", + "xet-runtime", +] + +[[package]] +name = "xet-core-structures" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb838aa8eb67d730af301584cf003caad407487606058292a6750711b603fbee" +dependencies = [ + "async-trait", + "base64", + "blake3", + "bytemuck", + "bytes", + "clap", + "countio", + "csv", + "futures", + "futures-util", + "getrandom 0.4.2", + "heapify", + "itertools", + "lazy_static", + "lz4_flex", + "more-asserts", + "rand 0.10.1", + "regex", + "safe-transmute", + "serde", + "static_assertions", + "tempfile", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "uuid", + "web-time", + "xet-runtime", +] + +[[package]] +name = "xet-data" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67fd409bef621411a9d9013798540bb8036cb2678f03ab39af89a5e88034ed8c" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "chrono", + "clap", + "gearhash", + "http", + "itertools", + "lazy_static", + "more-asserts", + "rand 0.10.1", + "serde", + "serde_json", + "sha2", + "tempfile", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "url", + "uuid", + "walkdir", + "xet-client", + "xet-core-structures", + 
"xet-runtime", +] + +[[package]] +name = "xet-runtime" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15d8f121c33866f7648b737abe70d0e2dd9c0af4ffdd7219207531d0283aa63d" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "chrono", + "colored", + "const-str", + "ctor", + "dirs", + "futures", + "git-version", + "humantime", + "konst", + "lazy_static", + "libc", + "more-asserts", + "oneshot", + "pin-project", + "rand 0.10.1", + "reqwest", + "serde", + "serde_json", + "shellexpand", + "sysinfo", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "tracing-appender", + "tracing-subscriber", + "whoami", + "winapi", +] + [[package]] name = "yoke" version = "0.8.2" diff --git a/Cargo.toml b/Cargo.toml index a483309..b844b9a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ name = "sparseio" authors = ["Matthew Hambrecht (hambrechtmatt@gmail.com)"] version = "0.0.1" +rust-version = "1.94.0" edition = "2024" description = "A library for coordinating sparse, out-of-order byte-range fetching and materialization of large objects." 
documentation = "https://docs.rs/sparseio" @@ -13,34 +14,75 @@ readme = "README.md" license = "Apache-2.0" [features] -debug = ["utils"] -file = ["dep:libc", "utils"] -http = ["dep:reqwest", "utils"] -utils = ["dep:tempfile", "dep:tracing-subscriber", "dep:tracing"] +# For debug test harnesses users can use to test their trait implementations +debug = ["test-utils"] + +# Reader/Writer trait implementations +impl-file = ["dep:libc", "test-utils"] +impl-opendal = ["dep:opendal"] +metadata-memory = [] + +# Example deps +example-huggingface = ["opendal-huggingface"] + +# Feature groups (shouldn't be called directly) +opendal-huggingface = [ + "impl-opendal", + "opendal/executors-tokio", + "opendal/reqwest-rustls-tls", + "opendal/services-huggingface", +] +test-utils = ["dep:tempfile", "dep:tracing-subscriber", "dep:tracing"] [package.metadata.docs.rs] -features = ["debug", "file", "http"] +features = ["debug", "impl-file", "impl-opendal", "metadata-memory"] [[example]] name = "file_to_file" path = "examples/file_to_file.rs" -required-features = ["file"] +required-features = ["impl-file", "metadata-memory"] + +[[example]] +name = "hf_file_bench" +path = "examples/hf_file_bench.rs" +required-features = ["example-huggingface", "impl-file", "metadata-memory"] [dependencies] -tokio = { version = "1.51.0", features = ["sync", "macros", "fs", "io-util", "rt-multi-thread", "time"] } +tokio = { version = "1.51.0", features = [ + "sync", + "macros", + "fs", + "io-util", + "rt-multi-thread", + "time", +] } futures = "0.3.32" bytes = "1.11.1" +bincode = { version = "2", features = ["serde"] } +hex = "0.4" libc = { version = "0.2", optional = true } -reqwest = { version = "0.12", optional = true, default-features = false, features = ["rustls-tls"] } -tracing = { version = "0.1.44", default-features = false, features = ["std"], optional = true } +opendal = { version = "0.56", optional = true, default-features = false } +serde = { version = "1", features = ["derive"] } +sha2 = "0.10" 
+tracing = { version = "0.1.44", default-features = false, features = [ + "std", +], optional = true } tempfile = { version = "3.27.0", optional = true } -tracing-subscriber = { version = "0.3.23", features = ["fmt", "ansi"], optional = true } +tracing-subscriber = { version = "0.3.23", features = [ + "fmt", + "ansi", +], optional = true } [dev-dependencies] clap = { version = "4.6.0", features = ["derive"] } async-stream = "0.3.6" rand = "0.8.5" -mockito = "1.7.2" tempfile = "3.27.0" tracing = { version = "0.1.44", default-features = false, features = ["std"] } tracing-subscriber = { version = "0.3.23", features = ["fmt", "ansi"] } +opendal = { version = "0.56", default-features = false, features = [ + "services-memory", +] } +reqwest = { version = "0.13.3", default-features = false, features = [ + "rustls", +] } diff --git a/README.md b/README.md index 7ccab0c..cb12f68 100644 --- a/README.md +++ b/README.md @@ -49,28 +49,27 @@ SparseIO models this as: - Pluggable backends via `Reader` and `Writer` traits. - Optional source implementations in `sources` (feature-gated). -## Current Feature Flags +## Current Sample Backends + +- `impl-file`: file-backed `Reader`/`Writer` implementations. +- `impl-opendal`: OpenDAL-backed Reader integration. + +- `metadata-memory`: in-memory metadata storage for single-process use. -- `file`: file-backed `Reader`/`Writer` implementations. -- `http`: reqwest-backed HTTP range-based `Reader` implementation. 
## Quickstart -Run the file-to-file sparse example: +Run the local-file to file-cache example: ```bash -cargo run --example file_to_file --features file -- \ +cargo run --example file_to_file --features impl-file,metadata-memory -- \ --src target/manual/file-to-file-src.bin \ - --dst target/manual/file-to-file-dst.bin \ + --dst target/manual/file-to-file-cache \ --source-len 8388608 \ - --chunk-size 262144 \ - --fill-percent 35 + --chunk-size 262144 ``` -The example intentionally materializes randomized chunk offsets first, then verifies: - -- full fill => destination matches source byte-for-byte, -- partial fill => written chunks match source and unwritten regions remain zeroed. +The example generates a local source file, reads a few chunk-aligned offsets into a file-cache directory, and then reopens the cache to show that metadata restores the original chunk size without having to cache the entire object. See: `examples/file_to_file.rs` and `examples/file_to_file.md`. @@ -78,21 +77,24 @@ See: `examples/file_to_file.rs` and `examples/file_to_file.md`. ```rust use std::sync::Arc; -use sparseio::Builder; +use opendal::Operator; async fn demo() -> std::io::Result<()> { - // HTTP File -> Sparse Local File - let reader = sparseio::sources::http::Reader::new("https://stuff.mit.edu/afs/sipb/contrib/pi/pi-billion.txt"); - let writer = sparseio::sources::file::Writer::new("pi.txt"); + // HTTP File -> File Cache Directory + let operator = Operator::from_uri(operator, "https://stuff.mit.edu"); + let reader = sparseio::sources::opendal::Reader::new( + "afs/sipb/contrib/pi/pi-billion.txt", + ); + let writer = sparseio::sources::file::Writer::new("pi-cache"); + let metadata = sparseio::metadata::memory::MemoryMetadata::new("pi-cache.metadata.bin", 1)?; - let io = Arc::new( - Builder::new() + let io = sparseio::Builder::new() .chunk_size(1 * 1024) + .metadata(metadata) .reader(reader) .writer(writer) .build() - .await? 
- ); + .await?; // Get a viewer into the Sparse store let mut viewer = io.viewer(); diff --git a/docs/CI.md b/docs/CI.md index ee31c5a..41f549a 100644 --- a/docs/CI.md +++ b/docs/CI.md @@ -55,31 +55,6 @@ This catches missing imports, cfg mistakes, and feature-gating regressions witho The unit workflow focuses on fast correctness checks that do not require the broader integration feature matrix. -### Unit Tests - -- `cargo nextest run --features utils` runs the main unit-oriented test set with the `utils` feature enabled. - -### Debug Harness Tests - -- `cargo nextest run --features utils,debug` runs tests that require the debug harness feature set. - -### Doc Tests - -- `cargo test --doc --all-features` executes Rust documentation tests across the full feature set. - ## Integration Workflow The integration workflow exercises feature-backed behavior and the broader end-to-end test matrix. - -### File Feature Integration Tests - -- `cargo nextest run --features utils,file` runs integration coverage for the file-backed reader and writer implementation. - -### HTTP Feature Integration Tests - -- `cargo nextest run --features utils,http` runs integration coverage for the HTTP reader implementation. - -### Full Feature Matrix - -- `cargo nextest run --all-features` runs the broadest integration-oriented test configuration. -- This acts as the final end-to-end feature-combination check inside the integration workflow. 
diff --git a/examples/common/mod.rs b/examples/common/mod.rs index ef5f3db..8635c2a 100644 --- a/examples/common/mod.rs +++ b/examples/common/mod.rs @@ -1,2 +1,3 @@ pub mod sparse_fill_visualizer; pub mod sparse_materialization; +pub mod utils; diff --git a/examples/common/sparse_fill_visualizer.rs b/examples/common/sparse_fill_visualizer.rs index 837967d..6a163cc 100644 --- a/examples/common/sparse_fill_visualizer.rs +++ b/examples/common/sparse_fill_visualizer.rs @@ -1,8 +1,8 @@ use std::collections::HashSet; -/// Visualizes how extents of a sparse object are being filled by rendering a horizontal bar. Each character -/// in the bar represents a portion of the object, with different symbols indicating the fill status. -pub fn render_sparse_fill_bar(filled_offsets: &HashSet, chunk_offsets: &[usize], width: usize) -> String { +/// Visualizes how a file cache fills chunk coverage by rendering a horizontal bar. Each character in the bar +/// represents a portion of the object, with different symbols indicating the cached fraction. +pub fn render_file_fill_bar(filled_offsets: &HashSet, chunk_offsets: &[usize], width: usize) -> String { if chunk_offsets.is_empty() { return String::new(); } diff --git a/examples/common/sparse_materialization.rs b/examples/common/sparse_materialization.rs index 954a594..3ec0b71 100644 --- a/examples/common/sparse_materialization.rs +++ b/examples/common/sparse_materialization.rs @@ -10,10 +10,10 @@ use std::time::Duration; use clap::Args; use rand::Rng; -use rand::seq::SliceRandom; use tokio::time::sleep; -use crate::common::sparse_fill_visualizer::render_sparse_fill_bar; +use crate::common::sparse_fill_visualizer::render_file_fill_bar; +use crate::common::utils::shuffle_offsets; #[derive(Args, Debug, Clone)] /// CLI options shared by sparse materialization examples. 
@@ -138,7 +138,7 @@ where ); println!("randomized read points (selected): {:?}", offsets); println!( - "sparse map width={} sleep={}ms between steps", + "file cache map width={} sleep={}ms between steps", config.options.progress_width, config.options.sleep_ms ); @@ -153,15 +153,15 @@ where let progress_step = index + 1; let progress_percent = (progress_step as f64 * 100.0) / offsets.len() as f64; - let sparse_fill_bar = render_sparse_fill_bar(&filled_offsets, &logical_offsets, config.options.progress_width); + let file_fill_bar = render_file_fill_bar(&filled_offsets, &logical_offsets, config.options.progress_width); println!( "filled chunk {} from requested offset {} -> normalized {} ({} bytes)", index, offset, normalized_offset, chunk_len ); println!( - "sparse fill map [{}] {:>6.2}% ({}/{})", - sparse_fill_bar, + "file cache fill map [{}] {:>6.2}% ({}/{})", + file_fill_bar, progress_percent, filled_offsets.len(), logical_offsets.len() @@ -211,11 +211,3 @@ fn jittered_offsets(offsets: &[usize], chunk_size: usize, len: }) .collect() } - -/// Randomizes chunk ordering in-place. -fn shuffle_offsets(offsets: &mut [usize], rng: &mut R) { - if offsets.len() <= 1 { - return; - } - offsets.shuffle(rng); -} diff --git a/examples/common/utils.rs b/examples/common/utils.rs new file mode 100644 index 0000000..4bc0f68 --- /dev/null +++ b/examples/common/utils.rs @@ -0,0 +1,33 @@ +use rand::Rng; +use rand::seq::SliceRandom; + +/// Randomizes offset ordering in-place. 
+pub fn shuffle_offsets(offsets: &mut [usize], rng: &mut R) { + if offsets.len() <= 1 { + return; + } + offsets.shuffle(rng); +} + +#[cfg(test)] +mod tests { + use rand::SeedableRng; + use rand::rngs::StdRng; + + use super::shuffle_offsets; + + #[test] + fn shuffle_offsets_preserves_offsets_and_changes_seeded_order() { + let original = vec![0, 4, 8, 12, 16, 20, 24, 28]; + let mut offsets = original.clone(); + let mut rng = StdRng::seed_from_u64(42); + + shuffle_offsets(&mut offsets, &mut rng); + + assert_ne!(offsets, original); + + let mut sorted = offsets; + sorted.sort_unstable(); + assert_eq!(sorted, original); + } +} diff --git a/examples/file_to_file.md b/examples/file_to_file.md index 5f728b9..c70bd45 100644 --- a/examples/file_to_file.md +++ b/examples/file_to_file.md @@ -1,37 +1,20 @@ -# `file_to_file` Example +# file_to_file Example -`examples/file_to_file.rs` demonstrates sparse, out-of-order materialization from a source filesystem file into a destination filesystem file using `sparseio`. -It uses a storage-agnostic orchestrator in `examples/common/sparse_materialization.rs` and passes file-specific callbacks. +`examples/file_to_file.rs` demonstrates reading selected ranges from a local file while populating a filesystem file cache. ## What It Does -1. Parses CLI flags for source/destination paths and sparse materialization behavior. -2. Optionally generates a deterministic source file (`--generate-source`). -3. Builds `SparseIO` with `file::Reader` (an implementation of the `Reader` trait with `tokio::fs`) + `file::Writer` (an implementation of the `Writer` trait with `tokio::fs`). -4. Randomizes chunk read points and materializes only a selected percentage (`--fill-percent`). -5. Passes a per-step callback that computes file-specific diagnostics (logical size, allocated size, hole checks). -6. Verifies output: - - Full fill: destination bytes must match source bytes. 
- - Partial fill: written chunks must match source, unwritten chunks must still read as zeroes/null. +1. Optionally generates a deterministic local source file. +2. Builds `SparseIO` with `file::Reader` for upstream reads and `file::Writer` for cache storage. +3. Reads a few chunk-aligned offsets to populate the cache directory. +4. Reopens the same cache directory to show metadata-driven chunk-size reuse. ## Run It ```bash -cargo run --example file_to_file --features file -- \ +cargo run --example file_to_file --features impl-file,metadata-memory -- \ --src target/manual/file-to-file-src.bin \ - --dst target/manual/file-to-file-dst.bin \ + --dst target/manual/file-to-file-cache \ --source-len 8388608 \ - --chunk-size 262144 \ - --fill-percent 35 \ - --sleep-ms 0 \ - --progress-width 32 + --chunk-size 262144 ``` - -## Useful Flags - -- `--generate-source` (`true` by default): Create deterministic source content. -- `--pre-size-dst`: Pre-allocate logical destination length before sparse writes. -- `--chunk-size`: Chunk size used by `SparseIO` and verification logic. -- `--fill-percent`: Percent of chunk offsets to materialize (random order). -- `--sleep-ms`: Delay between materialization steps. -- `--progress-width`: Width of the ASCII sparse map. diff --git a/src/common/chunks.rs b/src/common/chunks.rs new file mode 100644 index 0000000..d27af32 --- /dev/null +++ b/src/common/chunks.rs @@ -0,0 +1,51 @@ +use std::io::{Error, ErrorKind, Result}; + +/// Maximum chunk size accepted by SparseIO metadata and builders. +pub const MAX_CHUNK_SIZE: usize = 1024 * 1024 * 1024; // 1 GiB + +/// Converts a chunk index into its absolute byte offset. +/// +/// Returns an error if the multiplication would overflow `usize`. +pub fn chunk_offset(chunk_size: usize, chunk_index: usize) -> Result { + chunk_index + .checked_mul(chunk_size) + .ok_or_else(|| invalid_data("metadata chunk offset exceeds usize")) +} + +/// Converts an aligned byte offset into its chunk index. 
+/// +/// Returns an error if `offset` is not aligned to `chunk_size`. +pub fn chunk_index(offset: usize, chunk_size: usize) -> Result { + if !offset.is_multiple_of(chunk_size) { + return Err(invalid_data("metadata chunk offset is not aligned to chunk_size")); + } + Ok(offset / chunk_size) +} + +/// Returns the byte length of the chunk that starts at `offset`. +/// +/// The final chunk of an object can be shorter than `chunk_size`. +pub(crate) fn expected_chunk_len(chunk_size: usize, object_len: usize, offset: usize) -> Result { + if offset >= object_len { + return Err(invalid_data("metadata chunk offset is at or beyond EOF")); + } + Ok(chunk_size.min(object_len - offset)) +} + +/// Returns the byte length of the chunk identified by `chunk_index`. +pub(crate) fn expected_chunk_len_for_index(chunk_size: usize, object_len: usize, chunk_index: usize) -> Result { + expected_chunk_len(chunk_size, object_len, chunk_offset(chunk_size, chunk_index)?) +} + +/// Returns the number of chunks required to cover an object of `object_len`. +pub(crate) fn chunk_count_for_len(chunk_size: usize, object_len: usize) -> usize { + if object_len == 0 { + 0 + } else { + object_len.div_ceil(chunk_size) + } +} + +pub(crate) fn invalid_data(message: impl Into) -> Error { + Error::new(ErrorKind::InvalidData, message.into()) +} diff --git a/src/common/codec.rs b/src/common/codec.rs new file mode 100644 index 0000000..fa60806 --- /dev/null +++ b/src/common/codec.rs @@ -0,0 +1,24 @@ +use std::io::Result; + +use bytes::Bytes; +use serde::{Serialize, de::DeserializeOwned}; + +/// Helper function to serialize a generic data-type to a [`Bytes`] object. +pub fn encode_value(value: &V) -> Result { + let encoded = bincode::serde::encode_to_vec(value, bincode::config::standard()).map_err(|err| { + std::io::Error::new(std::io::ErrorKind::InvalidData, format!("failed to encode metadata value: {err}")) + })?; + Ok(Bytes::from(encoded)) +} + +/// Helper function to deserialize a u8 slice to a typed object. 
+pub fn decode_value(data: &[u8]) -> Result { + let (value, consumed): (V, usize) = + bincode::serde::decode_from_slice(data, bincode::config::standard()).map_err(|err| { + std::io::Error::new(std::io::ErrorKind::InvalidData, format!("failed to decode metadata value: {err}")) + })?; + if consumed != data.len() { + return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, "metadata value has trailing bytes")); + } + Ok(value) +} diff --git a/src/common/mod.rs b/src/common/mod.rs new file mode 100644 index 0000000..bb6f49a --- /dev/null +++ b/src/common/mod.rs @@ -0,0 +1,2 @@ +pub mod chunks; +pub mod codec; diff --git a/src/coverage.rs b/src/coverage.rs index 630364b..edd9975 100644 --- a/src/coverage.rs +++ b/src/coverage.rs @@ -8,6 +8,9 @@ use std::collections::BTreeMap; +use crate::metadata::ChunkRecord; + +#[derive(Clone)] pub(crate) struct Coverage { store: BTreeMap, } @@ -17,6 +20,19 @@ impl Coverage { Self { store: BTreeMap::new() } } + /// Reconstructs coverage from persisted chunk records. + pub(crate) fn from_chunk_records(chunk_size: usize, content_len: usize, chunks: &[ChunkRecord]) -> Self { + let mut coverage = Self::new(); + for chunk in chunks { + let offset = crate::metadata::chunk_offset(chunk_size, chunk.chunk_index) + .expect("validated metadata chunk index should convert to an offset"); + let length = crate::metadata::expected_chunk_len_for_index(chunk_size, content_len, chunk.chunk_index) + .expect("validated metadata chunk index should derive a chunk length"); + coverage.insert(offset, length); + } + coverage + } + /// Gets a prior chunk that starts before or at the target offset. pub(crate) fn get(&self, target: usize) -> Option<(usize, usize)> { let prior = self.store.range(..=target).next_back(); @@ -28,6 +44,11 @@ impl Coverage { let end = offset.saturating_add(length); self.store.insert(offset, end); } + + /// Removes a tracked chunk by its starting offset. 
+ pub(crate) fn remove(&mut self, offset: usize) { + self.store.remove(&offset); + } } #[cfg(test)] diff --git a/src/metadata/mod.rs b/src/metadata/mod.rs new file mode 100644 index 0000000..920aa81 --- /dev/null +++ b/src/metadata/mod.rs @@ -0,0 +1,54 @@ +mod spec; + +// SparseIO shipped metadata trait example implementations. +#[cfg(feature = "metadata-memory")] +pub mod memory; + +use serde::Serialize; +use serde::de::DeserializeOwned; +use std::io::Result; + +pub(crate) use crate::common::chunks::{ + MAX_CHUNK_SIZE, chunk_index, chunk_offset, expected_chunk_len, expected_chunk_len_for_index, +}; +pub(crate) use spec::checksum; +pub(crate) use spec::{ChunkRecord, MetadataSpec}; + +/// Trait describing how SparseIO stores metadata keys and values. +/// +/// Implementations are intentionally dumb key/value stores. SparseIO owns the +/// meaning of every key, chunk mapping, and refcount entry layered on top. +pub trait Metadata: Send + Sync { + /// Set a string key to an arbitrary value + fn set(&mut self, key: &str, value: V) -> impl std::future::Future> + Send + where + V: Serialize + Send; + + /// Retrieve a typed object from the Metadata-defined KV store. + /// + /// Contract must ensure that the retrieved object is deserializable back + /// to its insertion type. + fn get(&self, key: &str) -> impl std::future::Future>> + Send + where + V: DeserializeOwned + Send; + + /// Prefix-based key search into Metadata-defined KV store. + /// + /// Contract must ensure that the retrieved object is deserializable back + /// to its insertion type. + fn get_by_prefix( + &self, + prefix: &str, + ) -> impl std::future::Future>> + Send + where + V: DeserializeOwned + Send; + + /// Removes a key from the defined KV store. + fn delete(&mut self, key: &str) -> impl std::future::Future> + Send; +} + +/// Internal helper for verifying the validity of a SparseIO instance's +/// [`MetadataSpec`].
+pub(crate) fn validate_spec(spec: &MetadataSpec) -> Result<()> { + spec::validate_spec(spec) +} diff --git a/src/metadata/spec.rs b/src/metadata/spec.rs new file mode 100644 index 0000000..fd490d9 --- /dev/null +++ b/src/metadata/spec.rs @@ -0,0 +1,94 @@ +use std::io::Result; + +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; + +use crate::common::chunks::{MAX_CHUNK_SIZE, chunk_count_for_len, invalid_data}; + +pub(crate) const FORMAT_VERSION: u16 = 1; // Increment to denote breaking changes +const MAX_METADATA_BYTES: usize = 16 * 1024 * 1024; // 16 MiB, 128 KiB chunks ≈ ~65k chunks (64 GiB objects) + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +// Individual chunk record in the metadata spec, representing a single cached chunk of the +// object and its hash. +pub struct ChunkRecord { + pub chunk_index: usize, + pub sha256: [u8; 32], +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +// The encoded metadata format, which is what gets serialized and deserialized to recover the +// cached state of an object. +// +// **Note**: MetadataSpecs are not portable across systems with different bitnesses. +pub struct MetadataSpec { + pub version: u16, + pub source_identity: String, + pub content_len: usize, + pub chunk_size: usize, + pub chunks: Vec, +} + +/// Computes the SHA-256 checksum of a given buffer.
+pub fn checksum(data: &[u8]) -> [u8; 32] { + Sha256::digest(data).into() +} + +pub fn validate_spec(spec: &MetadataSpec) -> Result<()> { + validate_chunk_size(spec.chunk_size)?; + validate_chunk_count(spec.chunk_size, spec.content_len, spec.chunks.len())?; + + let mut last = None; + for chunk in &spec.chunks { + validate_chunk_record(spec.chunk_size, spec.content_len, chunk)?; + if last.is_some_and(|last| chunk.chunk_index <= last) { + return Err(invalid_data("metadata chunks must be sorted and unique")); + } + last = Some(chunk.chunk_index); + } + + Ok(()) +} + +/// Validates that the chunk size is within acceptable bounds +/// (greater than zero and less than or equal to the maximum). +fn validate_chunk_size(chunk_size: usize) -> Result<()> { + if chunk_size == 0 { + return Err(invalid_data("metadata chunk_size must be greater than zero")); + } + if chunk_size > MAX_CHUNK_SIZE { + return Err(invalid_data("metadata chunk_size exceeds maximum")); + } + Ok(()) +} + +/// Validates that the spec does not list more chunks than are needed to cover the content length +/// of the object, given the chunk size. Fewer chunks are accepted; only a count beyond that bound +/// is rejected. +fn validate_chunk_count(chunk_size: usize, object_len: usize, chunk_count: usize) -> Result<()> { + let max_chunks: usize = chunk_count_for_len(chunk_size, object_len); + if chunk_count > max_chunks { + return Err(invalid_data("metadata chunk count exceeds object length")); + } + Ok(()) +} + +/// Validates that a chunk record is well-formed and within the bounds of the object as defined by +/// the content length and chunk size.
+fn validate_chunk_record(chunk_size: usize, object_len: usize, chunk: &ChunkRecord) -> Result<()> { + let max_chunks = chunk_count_for_len(chunk_size, object_len); + if chunk.chunk_index >= max_chunks { + return Err(invalid_data("metadata chunk index is at or beyond EOF")); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use crate::common::chunks::expected_chunk_len_for_index; + + #[test] + fn final_chunk_length_is_derived_from_index() { + assert_eq!(expected_chunk_len_for_index(16, 20, 1).expect("final chunk length should derive"), 4); + } +} diff --git a/src/shared.rs b/src/shared.rs index 1cd0601..141c083 100644 --- a/src/shared.rs +++ b/src/shared.rs @@ -47,3 +47,4 @@ impl std::error::Error for SharedIoError {} pub(crate) type SharedChunk = Shared>>; pub(crate) const DEFAULT_CHUNK_SIZE: usize = 128 * 1024; // 128 KiB +pub(crate) const DEFAULT_PREFETCH_CHUNKS: usize = 8; // Default readahead (be cognizant of TCP slow start) diff --git a/src/sources/file.rs b/src/sources/file.rs index 99857f5..cfabec0 100644 --- a/src/sources/file.rs +++ b/src/sources/file.rs @@ -1,28 +1,11 @@ -//! File-backed source implementations. -//! -//! This module provides two small building blocks for file-based workflows: -//! [`Reader`], which implements [`crate::Reader`] for reading byte -//! ranges from a local file, and [`Writer`], which implements -//! [`crate::Writer`] while materializing logical extents into a sparse -//! destination file (). +//! Local file reader and filesystem-backed cache writer implementation. -use std::collections::BTreeMap; -use std::io::SeekFrom; -#[cfg(any(target_os = "linux", target_os = "macos"))] -use std::os::fd::{AsRawFd, RawFd}; -#[cfg(any(target_os = "linux", target_os = "macos"))] -use std::os::unix::fs::MetadataExt; +use std::io::{ErrorKind, SeekFrom}; use std::path::{Path, PathBuf}; use bytes::Bytes; use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; -/// Reads byte ranges from a local file using Tokio file I/O.
-/// -/// `Reader` is intended to be used as the [`crate::Reader`] for -/// [`crate::SparseIO`]. Each `read_at` call opens the file, seeks to the -/// requested offset, fills the provided buffer, and returns the number of bytes -/// written into it. #[derive(Clone, Debug)] pub struct Reader { path: PathBuf, @@ -41,14 +24,10 @@ impl Reader { } impl crate::Reader for Reader { - /// Reads at most `buffer.len()` bytes starting at `offset`. async fn read_at(&self, offset: usize, buffer: &mut [u8]) -> std::io::Result { let mut file = tokio::fs::File::open(&self.path).await?; file.seek(SeekFrom::Start(offset as u64)).await?; - // Read in a loop to handle short reads, which can occur because - // there's no guarantee from AsyncReadExt::read that it will fill - // the entire buffer in one call. let mut total_read = 0usize; while total_read < buffer.len() { let read_len = file.read(&mut buffer[total_read..]).await?; @@ -61,7 +40,6 @@ impl crate::Reader for Reader { Ok(total_read) } - /// Returns the source file length in bytes. async fn len(&self) -> std::io::Result { let size = tokio::fs::metadata(&self.path).await?.len(); usize::try_from(size) @@ -69,528 +47,57 @@ impl crate::Reader for Reader { } } -/// Stores cached file extents in a sparse destination file. -/// -/// The file itself is used to reserve the logical address space. Extent metadata -/// is tracked in-memory so the current [`crate::Writer`] trait can answer `read_extent` -/// requests efficiently. This makes the type useful as a simple example or -/// local-materialization target, but it does not currently persist extent -/// metadata across process restarts. -#[derive(Default)] +#[derive(Debug, Default)] pub struct Writer { dst: PathBuf, - extents: BTreeMap, -} - -#[derive(Clone, Copy, Debug)] -/// Internal metadata struct for tracking extents in `Writer`. -/// Each extent is defined by its starting offset and length. 
-struct Extent { - offset: usize, - length: usize, } impl Writer { - /// Creates a sparse destination file store rooted at `path`. - pub fn new(path: impl Into) -> Self { - Self { - dst: path.into(), - extents: BTreeMap::new(), - } + /// Creates a cache writer rooted at `dst`. + pub fn new(dst: impl Into) -> Self { + Self { dst: dst.into() } } - /// Returns the destination path used by this store. + /// Returns the destination directory used by this writer. pub fn path(&self) -> &Path { &self.dst } - #[cfg(any(target_os = "linux", target_os = "macos"))] - /// Punches out the storage backing a single extent while keeping logical size. - async fn punch_extent(&self, extent: Extent) -> std::io::Result<()> { - if extent.length == 0 { - return Ok(()); - } - - let file = tokio::fs::OpenOptions::new().write(true).open(&self.dst).await?; - let metadata = file.metadata().await?; - let logical_len = usize::try_from(metadata.len()) - .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidData, "file length exceeds usize"))?; - let end = extent.offset.saturating_add(extent.length).min(logical_len); - let block_size = metadata.blksize().max(1) as usize; - let fd = file.as_raw_fd(); - - if let Err(err) = punch_hole_aligned(fd, extent.offset, end, block_size) { - if is_unsupported_punch_error(&err) { - return Ok(()); - } - return Err(err); - } - - Ok(()) - } - - #[cfg(not(any(target_os = "linux", target_os = "macos")))] - /// No-op hole punch fallback on unsupported platforms. - async fn punch_extent(&self, _extent: Extent) -> std::io::Result<()> { - Ok(()) - } -} - -#[cfg(any(target_os = "linux", target_os = "macos"))] -/// Punches a hole for `[start, end)` after filesystem block alignment. 
-fn punch_hole_aligned(fd: RawFd, start: usize, end: usize, block_size: usize) -> std::io::Result<()> { - if end <= start { - return Ok(()); - } - - let aligned_start = align_up(start, block_size); - let aligned_end = align_down(end, block_size); - if aligned_end <= aligned_start { - return Ok(()); - } - - punch_hole(fd, aligned_start, aligned_end - aligned_start) -} - -#[cfg(any(target_os = "linux", target_os = "macos"))] -/// Attempts hole punching and suppresses unsupported-filesystem errors. -fn punch_hole_best_effort(fd: RawFd, start: usize, end: usize, block_size: usize) -> std::io::Result<()> { - if let Err(err) = punch_hole_aligned(fd, start, end, block_size) { - if is_unsupported_punch_error(&err) { - return Ok(()); - } - return Err(err); - } - Ok(()) -} - -#[cfg(any(target_os = "linux", target_os = "macos"))] -/// Aligns `value` upward to the nearest `alignment` boundary. -fn align_up(value: usize, alignment: usize) -> usize { - if alignment <= 1 { - return value; - } - let rem = value % alignment; - if rem == 0 { - value - } else { - value.saturating_add(alignment - rem) - } -} - -#[cfg(any(target_os = "linux", target_os = "macos"))] -/// Aligns `value` downward to the nearest `alignment` boundary. -fn align_down(value: usize, alignment: usize) -> usize { - if alignment <= 1 { - return value; + fn key_path(&self, key: &str) -> PathBuf { + self.dst.join(hex::encode(key.as_bytes())) } - value - (value % alignment) } impl crate::Writer for Writer { - /// Writes `data` at `offset` and records the resulting extent. 
- async fn create_extent(&mut self, offset: usize, data: bytes::Bytes) -> std::io::Result<()> { - let length = data.len(); - let end = offset - .checked_add(length) - .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidInput, "extent exceeds usize"))?; - + async fn set_cache(&mut self, key: &str, value: &[u8]) -> std::io::Result<()> { + tokio::fs::create_dir_all(&self.dst).await?; let mut file = tokio::fs::OpenOptions::new() .create(true) .write(true) - .truncate(false) - .open(&self.dst) + .truncate(true) + .open(self.key_path(key)) .await?; - - file.seek(SeekFrom::Start(offset as u64)).await?; - file.write_all(data.as_ref()).await?; + file.write_all(value).await?; file.flush().await?; - self.extents.insert(offset, Extent { offset, length }); - - #[cfg(any(target_os = "linux", target_os = "macos"))] - { - let metadata = file.metadata().await?; - let logical_len = usize::try_from(metadata.len()) - .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidData, "file length exceeds usize"))?; - let block_size = metadata.blksize().max(1) as usize; - let fd = file.as_raw_fd(); - - // Punch only local gaps around this extent instead of scanning all extents. - let prev_end = self - .extents - .range(..offset) - .next_back() - .map(|(_, extent)| extent.offset.saturating_add(extent.length)) - .unwrap_or(0); - let next_start = self - .extents - .range(offset.saturating_add(1)..) - .next() - .map(|(_, extent)| extent.offset); - - if prev_end < offset { - punch_hole_best_effort(fd, prev_end, offset, block_size)?; - } - - if let Some(next_start) = next_start { - if end < next_start { - punch_hole_best_effort(fd, end, next_start, block_size)?; - } - } else if end < logical_len { - punch_hole_best_effort(fd, end, logical_len, block_size)?; - } - } - Ok(()) } - /// Reads a previously tracked extent at `offset`, or returns empty bytes. 
- async fn read_extent(&self, offset: usize) -> std::io::Result { - let Some(extent) = self.extents.get(&offset).copied() else { - return Ok(Bytes::new()); + async fn get_cache(&self, key: &str) -> std::io::Result> { + let mut file = match tokio::fs::File::open(self.key_path(key)).await { + Ok(file) => file, + Err(err) if err.kind() == ErrorKind::NotFound => return Ok(None), + Err(err) => return Err(err), }; - - let mut file = tokio::fs::File::open(&self.dst).await?; - file.seek(SeekFrom::Start(extent.offset as u64)).await?; - - let mut data = vec![0u8; extent.length]; - file.read_exact(&mut data).await?; - - Ok(Bytes::from(data)) - } - - /// Removes tracked extent metadata and hole-punches that extent range. - async fn delete_extent(&mut self, offset: usize) -> std::io::Result<()> { - if let Some(extent) = self.extents.remove(&offset) { - self.punch_extent(extent).await?; - } - Ok(()) - } -} - -#[cfg(target_os = "linux")] -/// Linux hole punching via `fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)`. -fn punch_hole(fd: RawFd, offset: usize, len: usize) -> std::io::Result<()> { - if len == 0 { - return Ok(()); - } - - let offset = libc::off_t::try_from(offset) - .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidInput, "offset exceeds off_t"))?; - let len = libc::off_t::try_from(len) - .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidInput, "length exceeds off_t"))?; - - let result = unsafe { libc::fallocate(fd, libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, offset, len) }; - if result == 0 { - Ok(()) - } else { - Err(std::io::Error::last_os_error()) - } -} - -#[cfg(target_os = "macos")] -/// macOS hole punching via `fcntl(F_PUNCHHOLE)`. 
-fn punch_hole(fd: RawFd, offset: usize, len: usize) -> std::io::Result<()> { - if len == 0 { - return Ok(()); - } - - let offset = libc::off_t::try_from(offset) - .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidInput, "offset exceeds off_t"))?; - let len = libc::off_t::try_from(len) - .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidInput, "length exceeds off_t"))?; - - #[repr(C)] - struct Fpunchhole { - fp_flags: libc::c_uint, - reserved: libc::c_uint, - fp_offset: libc::off_t, - fp_length: libc::off_t, - } - - const F_PUNCHHOLE: libc::c_int = 99; - let mut punch = Fpunchhole { - fp_flags: 0, - reserved: 0, - fp_offset: offset, - fp_length: len, - }; - - let result = unsafe { libc::fcntl(fd, F_PUNCHHOLE, &mut punch) }; - if result != -1 { - Ok(()) - } else { - Err(std::io::Error::last_os_error()) - } -} - -#[cfg(any(target_os = "linux", target_os = "macos"))] -/// Returns true when a hole-punch failure indicates the filesystem does not support it. -fn is_unsupported_punch_error(err: &std::io::Error) -> bool { - matches!(err.raw_os_error(), Some(code) if code == libc::ENOTSUP || code == libc::EOPNOTSUPP) -} - -#[cfg(test)] -mod tests { - use std::fs; - use std::io::{Read, Seek, Write}; - #[cfg(target_os = "linux")] - use std::os::unix::fs::MetadataExt; - use std::path::Path; - #[cfg(target_os = "macos")] - use std::process::Command; - use std::time::{SystemTime, UNIX_EPOCH}; - - use bytes::Bytes; - - use super::*; - use crate::{Reader as _, Writer as _}; - - /// Creates a unique test file path under `target/sparse-file-tests`. - fn test_file_path(name: &str) -> std::io::Result { - let dir = Path::new("target").join("sparse-file-tests"); - fs::create_dir_all(&dir)?; - - let unique = SystemTime::now() - .duration_since(UNIX_EPOCH) - .expect("system clock should be after unix epoch") - .as_nanos(); - - Ok(dir.join(format!("{name}-{unique}.dat"))) - } - - #[cfg(target_os = "linux")] - /// Returns on-disk allocated bytes for `path` on Linux. 
- fn allocated_bytes(path: &Path) -> std::io::Result { - Ok(fs::metadata(path)?.blocks() * 512) - } - - #[cfg(target_os = "macos")] - /// Returns on-disk allocated bytes for `path` on macOS via `du -k`. - fn allocated_bytes(path: &Path) -> Result> { - let output = Command::new("du").arg("-k").arg(path).output()?; - assert!( - output.status.success(), - "du -k failed for {} with status {:?}", - path.display(), - output.status.code() - ); - - let stdout = String::from_utf8(output.stdout)?; - let kib = stdout - .split_whitespace() - .next() - .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidData, "du output missing size column"))? - .parse::()?; - - Ok(kib * 1024) + let mut data = Vec::new(); + file.read_to_end(&mut data).await?; + Ok(Some(Bytes::from(data))) } - /// Writes a dense control file of exactly `len` bytes. - fn write_dense_control_file(path: &Path, len: usize) -> std::io::Result<()> { - let mut file = fs::File::create(path)?; - let chunk = vec![0x5Au8; 1024 * 1024]; - let mut remaining = len; - - while remaining > 0 { - let to_write = remaining.min(chunk.len()); - file.write_all(&chunk[..to_write])?; - remaining -= to_write; + async fn delete_cache(&mut self, key: &str) -> std::io::Result<()> { + match tokio::fs::remove_file(self.key_path(key)).await { + Ok(()) => Ok(()), + Err(err) if err.kind() == ErrorKind::NotFound => Ok(()), + Err(err) => Err(err), } - - file.sync_all()?; - Ok(()) - } - - #[tokio::test] - /// Verifies file-backed reader returns the requested byte range. 
- async fn file_reader_reads_requested_range() -> Result<(), Box> { - let path = test_file_path("reader")?; - fs::write(&path, b"hello sparseio")?; - - let reader = Reader::new(path.clone()); - let mut buffer = [0u8; 8]; - let bytes_read = reader.read_at(6, &mut buffer).await?; - - assert_eq!(bytes_read, 8); - assert_eq!(&buffer, b"sparseio"); - assert_eq!(reader.len().await?, 14); - assert_eq!(reader.path(), path.as_path()); - Ok(()) - } - - #[tokio::test] - /// Verifies reads near EOF are truncated to remaining bytes. - async fn file_reader_truncates_to_remaining_bytes() -> Result<(), Box> { - let path = test_file_path("reader-tail")?; - fs::write(&path, b"hello sparseio")?; - - let reader = Reader::new(path); - let mut buffer = [0xFFu8; 8]; - let bytes_read = reader.read_at(11, &mut buffer).await?; - - assert_eq!(bytes_read, 3); - assert_eq!(&buffer[..3], b"eio"); - assert_eq!(&buffer[3..], &[0xFF; 5]); - Ok(()) - } - - #[tokio::test] - /// Verifies reads at EOF return zero bytes. - async fn file_reader_reports_zero_bytes_at_eof() -> Result<(), Box> { - let path = test_file_path("reader-eof")?; - fs::write(&path, b"hello sparseio")?; - - let reader = Reader::new(path); - let mut buffer = [0xAAu8; 4]; - let bytes_read = reader.read_at(14, &mut buffer).await?; - - assert_eq!(bytes_read, 0); - assert_eq!(buffer, [0xAA; 4]); - Ok(()) - } - - #[tokio::test] - /// Verifies extent creation, retrieval, persistence, and deletion behavior. 
- async fn sparse_file_tracks_created_extents() -> Result<(), Box> { - let path = test_file_path("extent-store")?; - let mut store = Writer::new(path.clone()); - - store.create_extent(4096, Bytes::from_static(b"hole data")).await?; - - let data = store.read_extent(4096).await?; - assert_eq!(data, Bytes::from_static(b"hole data")); - assert_eq!(fs::metadata(&path)?.len(), 4105); - assert_eq!(store.path(), path.as_path()); - - let mut persisted = vec![0u8; 9]; - let mut file = fs::File::open(&path)?; - file.seek(SeekFrom::Start(4096))?; - file.read_exact(&mut persisted)?; - assert_eq!(&persisted, b"hole data"); - - store.delete_extent(4096).await?; - assert!(store.read_extent(4096).await?.is_empty()); - Ok(()) - } - - #[tokio::test] - /// Verifies bytes between disjoint extents remain zero-filled. - async fn sparse_file_preserves_zero_filled_gap_between_extents() -> Result<(), Box> { - let path = test_file_path("extent-gap")?; - let mut store = Writer::new(path.clone()); - - let front = Bytes::from_static(b"front"); - let back = Bytes::from_static(b"back"); - let back_offset = 8192; - - store.create_extent(0, front.clone()).await?; - store.create_extent(back_offset, back.clone()).await?; - - let mut file = fs::File::open(&path)?; - - let mut front_bytes = vec![0u8; front.len()]; - file.read_exact(&mut front_bytes)?; - assert_eq!(front_bytes, front.as_ref()); - - let gap_len = back_offset - front.len(); - let mut gap = vec![0u8; gap_len]; - file.read_exact(&mut gap)?; - assert!(gap.iter().all(|byte| *byte == 0)); - - let mut back_bytes = vec![0u8; back.len()]; - file.read_exact(&mut back_bytes)?; - assert_eq!(back_bytes, back.as_ref()); - assert_eq!(fs::metadata(&path)?.len(), (back_offset + back.len()) as u64); - - Ok(()) - } - - #[tokio::test] - /// Verifies sparse layout uses substantially less allocated disk than dense layout. 
- async fn sparse_file_uses_less_disk_than_dense_file() -> Result<(), Box> { - let sparse_path = test_file_path("sparse")?; - let dense_path = test_file_path("dense")?; - let mut store = Writer::new(sparse_path.clone()); - - let logical_len = 1024 * 1024 * 1024; - let head_len = 4 * 1024; - let tail_len = 4 * 1024; - let start = Bytes::from(vec![0xAB; head_len]); - let end = Bytes::from(vec![0xCD; tail_len]); - - store.create_extent(0, start).await?; - store.create_extent(logical_len - tail_len, end).await?; - - let logical_size = fs::metadata(&sparse_path)?.len(); - assert_eq!(logical_size, logical_len as u64); - - write_dense_control_file(&dense_path, logical_len)?; - - let sparse_bytes = allocated_bytes(&sparse_path)?; - let dense_bytes = allocated_bytes(&dense_path)?; - - assert!( - sparse_bytes < dense_bytes / 8, - "expected sparse file to use much less disk than dense files: sparse={sparse_bytes}B dense={dense_bytes}B" - ); - - Ok(()) - } - - #[tokio::test] - /// Verifies deleting a missing extent is a no-op rather than an error. - async fn delete_missing_extent_is_a_noop() -> Result<(), Box> { - let path = test_file_path("delete-missing")?; - let mut store = Writer::new(path); - - store.delete_extent(1024).await?; - store.delete_extent(1024).await?; - assert!(store.read_extent(1024).await?.is_empty()); - Ok(()) - } - - #[tokio::test] - /// Verifies same-offset writes follow last-write-wins semantics. - async fn same_offset_overwrite_is_last_write_wins() -> Result<(), Box> { - let path = test_file_path("overwrite")?; - let mut store = Writer::new(path); - - store.create_extent(2048, Bytes::from_static(b"first")).await?; - store.create_extent(2048, Bytes::from_static(b"second")).await?; - - assert_eq!(store.read_extent(2048).await?, Bytes::from_static(b"second")); - Ok(()) - } - - #[tokio::test] - /// Verifies short tail extents round-trip exactly as written. 
- async fn short_tail_extent_round_trips_exact_bytes() -> Result<(), Box> { - let path = test_file_path("tail")?; - let mut store = Writer::new(path); - - store.create_extent(4093, Bytes::from_static(b"tail")).await?; - assert_eq!(store.read_extent(4093).await?, Bytes::from_static(b"tail")); - Ok(()) - } - - #[tokio::test] - /// Verifies file-open and file-path failures propagate as I/O errors. - async fn file_open_and_path_error_propagation_is_preserved() -> Result<(), Box> { - let missing = test_file_path("missing-source")?; - let reader = Reader::new(&missing); - let mut buffer = [0u8; 4]; - let err = reader.read_at(0, &mut buffer).await.expect_err("missing source should fail"); - assert_eq!(err.kind(), std::io::ErrorKind::NotFound); - - let dst = test_file_path("missing-parent/subdir/dst.bin")?; - let mut store = Writer::new(&dst); - let err = store - .create_extent(0, Bytes::from_static(b"data")) - .await - .expect_err("missing parent should fail"); - assert_eq!(err.kind(), std::io::ErrorKind::NotFound); - assert!(store.read_extent(0).await?.is_empty()); - Ok(()) } } diff --git a/src/sources/http.rs b/src/sources/http.rs deleted file mode 100644 index ff26d08..0000000 --- a/src/sources/http.rs +++ /dev/null @@ -1,298 +0,0 @@ -//! HTTP range-based source implementation. -//! -//! This module provides [`Reader`], a [`crate::Reader`] trait implementation that -//! fetches byte ranges from an HTTP endpoint using [`reqwest`]. - -use std::io; -use std::time::Duration; - -use reqwest::StatusCode; - -const IDLE_POOL_TIMEOUT: Duration = Duration::from_secs(300); -const IDLE_POOL_MAX_SIZE: usize = 32; -const KEEPALIVE: Duration = Duration::from_secs(30); - -/// Reads byte ranges from an HTTP resource. -/// -/// `Reader` issues `GET` requests with a `Range` header for `read_at`, and -/// attempts to determine object length from `HEAD` `Content-Length` in `len`. 
-#[derive(Clone, Debug)] -pub struct Reader { - client: reqwest::Client, - url: String, - len_override: Option, -} - -impl Reader { - /// Creates a new HTTP reader for `url` with a default [`reqwest::Client`]. - pub fn new(url: impl Into) -> Self { - // Build a long-lived client so sequential range reads can reuse pooled - // connections instead of re-handshaking after short idle periods. - let client = reqwest::Client::builder() - .pool_idle_timeout(IDLE_POOL_TIMEOUT) - .pool_max_idle_per_host(IDLE_POOL_MAX_SIZE) - .tcp_keepalive(KEEPALIVE) - .build() - .unwrap_or_else(|_| reqwest::Client::new()); - - Self { - client, - url: url.into(), - len_override: None, - } - } - - /// Creates a new HTTP reader using a caller-provided [`reqwest::Client`]. - pub fn with_client(client: reqwest::Client, url: impl Into) -> Self { - Self { - client, - url: url.into(), - len_override: None, - } - } - - /// Returns a copy of this reader configured with a caller-provided object length. - /// - /// Use this when the caller already knows the object size but the server - /// does not provide reliable `Content-Length` / `Content-Range` headers. - pub fn with_len_override(mut self, len: usize) -> Self { - self.len_override = Some(len); - self - } - - /// Sets or clears the caller-provided object length override. - pub fn set_len_override(&mut self, len: Option) { - self.len_override = len; - } - - /// Returns the currently configured object length override. - pub fn len_override(&self) -> Option { - self.len_override - } - - /// Returns the source URL used by this reader. 
- pub fn url(&self) -> &str { - &self.url - } -} - -impl crate::Reader for Reader { - async fn read_at(&self, offset: usize, buffer: &mut [u8]) -> io::Result { - if buffer.is_empty() { - return Ok(0); - } - - let requested_end = offset - .checked_add(buffer.len().saturating_sub(1)) - .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "range end overflow"))?; - let range_header = format!("bytes={offset}-{requested_end}"); - - let mut response = self - .client - .get(&self.url) - .header(reqwest::header::RANGE, range_header) - .send() - .await - .map_err(io::Error::other)?; - - match response.status() { - StatusCode::PARTIAL_CONTENT => { - let (start, response_end, total) = response - .headers() - .get(reqwest::header::CONTENT_RANGE) - .and_then(|value| value.to_str().ok()) - .and_then(parse_content_range) - .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "missing or malformed Content-Range"))?; - - if start != offset { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - format!("HTTP range response started at {start}, expected {offset}"), - )); - } - - let expected_end = if response_end == requested_end { - requested_end - } else if let Some(total) = total { - let eof_end = total.checked_sub(1).ok_or_else(|| { - io::Error::new(io::ErrorKind::InvalidData, "invalid Content-Range total length") - })?; - if response_end == eof_end && response_end < requested_end { - response_end - } else { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - format!("HTTP range response ended at {response_end}, expected {requested_end}"), - )); - } - } else { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - format!("HTTP range response ended at {response_end}, expected {requested_end}"), - )); - }; - - let advertised_len = expected_end - .checked_sub(start) - .and_then(|len| len.checked_add(1)) - .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "invalid Content-Range length"))?; - if advertised_len > buffer.len() { - return 
Err(io::Error::new( - io::ErrorKind::InvalidData, - format!( - "HTTP range response advertised {advertised_len} bytes for a {} byte request", - buffer.len() - ), - )); - } - - let copied = read_response_prefix(&mut response, buffer).await?; - if copied != advertised_len { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - format!( - "HTTP range response body length {copied} did not match advertised length {advertised_len}" - ), - )); - } - Ok(copied) - }, - StatusCode::OK if offset == 0 => read_response_prefix(&mut response, buffer).await, - StatusCode::RANGE_NOT_SATISFIABLE => Ok(0), - _ => Err(io::Error::other(format!( - "unexpected HTTP status {} for ranged read at offset {offset}", - response.status() - ))), - } - } - - async fn len(&self) -> io::Result { - if let Some(len) = self.len_override { - return Ok(len); - } - - match self.client.head(&self.url).send().await { - Ok(response) if response.status().is_success() => { - if let Some(len) = response - .headers() - .get(reqwest::header::CONTENT_LENGTH) - .and_then(|value| value.to_str().ok()) - .and_then(|value| value.parse::().ok()) - { - return Ok(len); - } - }, - _ => {}, - } - - // Fall back to a tiny ranged GET for servers that omit Content-Length on HEAD. 
- let get = self - .client - .get(&self.url) - .header(reqwest::header::RANGE, "bytes=0-0") - .send() - .await; - if let Ok(response) = get { - if response.status() == StatusCode::PARTIAL_CONTENT { - if let Some(total) = response - .headers() - .get(reqwest::header::CONTENT_RANGE) - .and_then(|value| value.to_str().ok()) - .and_then(parse_content_range) - .and_then(|(_, _, total)| total) - { - return Ok(total); - } - } else if response.status() == StatusCode::OK { - if let Some(len) = response - .headers() - .get(reqwest::header::CONTENT_LENGTH) - .and_then(|value| value.to_str().ok()) - .and_then(|value| value.parse::().ok()) - { - return Ok(len); - } - if let Ok(bytes) = response.bytes().await { - return Ok(bytes.len()); - } - } - } - - Err(io::Error::new( - io::ErrorKind::InvalidData, - format!("failed to determine content length for {}", self.url), - )) - } -} - -async fn read_response_prefix(response: &mut reqwest::Response, buffer: &mut [u8]) -> io::Result { - let mut copied = 0usize; - - while copied < buffer.len() { - let Some(chunk) = response.chunk().await.map_err(io::Error::other)? else { - break; - }; - if chunk.is_empty() { - continue; - } - - let to_copy = (buffer.len() - copied).min(chunk.len()); - buffer[copied..copied + to_copy].copy_from_slice(&chunk[..to_copy]); - copied += to_copy; - } - - Ok(copied) -} - -fn parse_content_range(value: &str) -> Option<(usize, usize, Option)> { - let value = value.strip_prefix("bytes ")?; - let (range, total) = value.split_once('/')?; - let (start, end) = range.split_once('-')?; - let start = start.parse::().ok()?; - let end = end.parse::().ok()?; - if end < start { - return None; - } - - let total = if total == "*" { - None - } else { - Some(total.parse::().ok()?) 
- }; - - Some((start, end, total)) -} - -#[deprecated(note = "Use sources::http::Reader")] -pub type HttpReader = Reader; - -#[cfg(test)] -mod tests { - use super::{Reader, parse_content_range}; - use crate::Reader as _; - - /// This test keeps the override path isolated from any HTTP probing so - /// callers can bypass unreliable server metadata completely. - #[tokio::test] - async fn len_uses_override_without_network() { - let reader = Reader::new("http://127.0.0.1:1/unreachable").with_len_override(12345); - assert_eq!(reader.len().await.expect("len override should be returned"), 12345); - } - - /// This test pins the plain Content-Range parser so the HTTP reader can - /// safely interpret a server's range metadata. - #[test] - fn parse_content_range_extracts_start_end_and_total_length() { - assert_eq!(parse_content_range("bytes 0-0/99"), Some((0, 0, Some(99)))); - assert_eq!(parse_content_range("bytes 10-19/2048"), Some((10, 19, Some(2048)))); - } - - /// This test keeps malformed or wildcard totals from being misread as - /// valid lengths. - #[test] - fn parse_content_range_rejects_malformed_ranges_and_keeps_wildcard_totals_optional() { - assert_eq!(parse_content_range("bytes 0-0/*"), Some((0, 0, None))); - assert_eq!(parse_content_range("bytes 8-3/12"), None); - assert_eq!(parse_content_range("not-a-range"), None); - } -} diff --git a/src/sources/mod.rs b/src/sources/mod.rs index 799a039..134b349 100644 --- a/src/sources/mod.rs +++ b/src/sources/mod.rs @@ -1,8 +1,8 @@ -//! Implementations of the [`crate::Reader`], and [`crate::Writer`] for dealing with common use cases. Usages of +//! Implementations of the [`crate::Reader`] and [`crate::Writer`] for dealing with common use cases. Usages of //! such sources are shown in the examples, and they can be used as building blocks for more complex custom sources, //! references for building your own source, or as-is. 
-#[cfg(feature = "file")] +#[cfg(feature = "impl-file")] pub mod file; -#[cfg(feature = "http")] -pub mod http; +#[cfg(feature = "impl-opendal")] +pub mod opendal; diff --git a/src/sources/opendal.rs b/src/sources/opendal.rs new file mode 100644 index 0000000..daaf0f8 --- /dev/null +++ b/src/sources/opendal.rs @@ -0,0 +1,87 @@ +//! OpenDAL-backed range reader implementation. + +use std::io::{Error, ErrorKind}; + +/// Reads byte ranges from an OpenDAL operator path. +#[derive(Clone, Debug)] +pub struct Reader { + operator: opendal::Operator, + path: String, + len: usize, +} + +impl Reader { + /// Creates a reader for `path` using `operator`. + pub async fn new(operator: opendal::Operator, path: impl Into) -> std::io::Result { + /// Ensure this OpenDAL service has the minimum required capabilities + /// to be compatible with SparseIO. + let capabilities = operator.info().full_capability(); + if !capabilities.stat { + return Err(Error::new(ErrorKind::InvalidInput, "OpenDAL operator does not support stat")); + } + if !capabilities.read { + return Err(Error::new(ErrorKind::InvalidInput, "OpenDAL operator does not support read")); + } + + let path = normalize_relative_path(&path.into()); + let len = operator.stat(&path).await.map_err(map_opendal_error)?.content_length(); + let len = usize::try_from(len) + .map_err(|_| Error::new(ErrorKind::InvalidData, "OpenDAL object length exceeds usize"))?; + + Ok(Self { operator, path, len }) + } + + /// Returns the OpenDAL path used by this reader. 
+ pub fn path(&self) -> &str { + &self.path + } +} + +impl crate::Reader for Reader { + async fn read_at(&self, offset: usize, buffer: &mut [u8]) -> std::io::Result { + if buffer.is_empty() || offset >= self.len { + return Ok(0); + } + + let end = offset + .checked_add(buffer.len()) + .map(|end| end.min(self.len)) + .ok_or_else(|| Error::new(ErrorKind::InvalidInput, "range end overflow"))?; + if end <= offset { + return Ok(0); + } + + let start = u64::try_from(offset).map_err(|_| Error::new(ErrorKind::InvalidInput, "offset exceeds u64"))?; + let end = u64::try_from(end).map_err(|_| Error::new(ErrorKind::InvalidInput, "range end exceeds u64"))?; + let data = match self.operator.read_with(&self.path).range(start..end).await { + Ok(data) => data.to_bytes(), + Err(err) if err.kind() == opendal::ErrorKind::RangeNotSatisfied => return Ok(0), + Err(err) => return Err(map_opendal_error(err)), + }; + + let copied = data.len().min(buffer.len()); + buffer[..copied].copy_from_slice(&data[..copied]); + Ok(copied) + } + + async fn len(&self) -> std::io::Result { + Ok(self.len) + } +} + +fn normalize_relative_path(path: &str) -> String { + let normalized = path + .split('/') + .filter(|segment| !segment.is_empty() && *segment != ".") + .collect::>() + .join("/"); + if normalized.is_empty() { + "/".to_owned() + } else { + normalized + } +} + +fn map_opendal_error(err: opendal::Error) -> Error { + err.into() +} diff --git a/src/utils/flaky.rs b/src/utils/flaky.rs index b291ca3..5f1fce3 100644 --- a/src/utils/flaky.rs +++ b/src/utils/flaky.rs @@ -5,9 +5,6 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use bytes::Bytes; /// Reader double that fails once for selected offsets. -/// -/// This is useful when a test needs to prove SparseIO clears in-flight state -/// after a transient upstream error and allows a later retry to succeed. 
#[derive(Clone)] pub struct Reader { data: Arc>, @@ -47,44 +44,40 @@ impl crate::Reader for Reader { } } -/// Writer double that fails once for selected offsets. -/// -/// This is useful when a test needs to prove SparseIO clears in-flight state -/// after a transient materialization failure and retries cleanly. +/// Writer double that fails once for selected cache keys. #[derive(Default, Clone)] pub struct Writer { - extents: Arc>>, - fail_offsets: Arc>, + entries: Arc>>, + fail_keys: Arc>, failures: Arc, } impl Writer { - /// Creates a flaky writer that fails once on any listed offset. - pub fn fail_once_at(fail_offsets: impl IntoIterator) -> Self { + pub fn fail_once_at(fail_keys: impl IntoIterator>) -> Self { Self { - extents: Arc::new(tokio::sync::Mutex::new(std::collections::BTreeMap::new())), - fail_offsets: Arc::new(fail_offsets.into_iter().collect()), + entries: Arc::new(tokio::sync::Mutex::new(std::collections::BTreeMap::new())), + fail_keys: Arc::new(fail_keys.into_iter().map(Into::into).collect()), failures: Arc::new(AtomicUsize::new(0)), } } } impl crate::Writer for Writer { - async fn create_extent(&mut self, offset: usize, data: Bytes) -> std::io::Result<()> { - if self.fail_offsets.contains(&offset) && self.failures.fetch_add(1, Ordering::SeqCst) == 0 { - return Err(std::io::Error::other(format!("transient writer failure at {offset}"))); + async fn set_cache(&mut self, key: &str, value: &[u8]) -> std::io::Result<()> { + if self.fail_keys.contains(key) && self.failures.fetch_add(1, Ordering::SeqCst) == 0 { + return Err(std::io::Error::other("transient writer failure")); } - self.extents.lock().await.insert(offset, data); + self.entries.lock().await.insert(key.to_owned(), Bytes::copy_from_slice(value)); Ok(()) } - async fn read_extent(&self, offset: usize) -> std::io::Result { - Ok(self.extents.lock().await.get(&offset).cloned().unwrap_or_else(Bytes::new)) + async fn get_cache(&self, key: &str) -> std::io::Result> { + 
Ok(self.entries.lock().await.get(key).cloned()) } - async fn delete_extent(&mut self, offset: usize) -> std::io::Result<()> { - self.extents.lock().await.remove(&offset); + async fn delete_cache(&mut self, key: &str) -> std::io::Result<()> { + self.entries.lock().await.remove(key); Ok(()) } } diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 891d900..0c961e7 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,6 +1,6 @@ //! Opt-in helper utilities for examples, tests, and downstream validation. //! -//! This module is public behind the `utils` feature, and is **not** considered part +//! This module is public behind the `test-utils` feature, and is **not** considered part //! of the public API contract. It is intended for internal use in examples and tests, //! and for downstream users to copy and adapt as needed. As such, it may change without //! warning and should not be used directly by downstream users. diff --git a/src/utils/oracle.rs b/src/utils/oracle.rs index a207e4f..8a0c8ad 100644 --- a/src/utils/oracle.rs +++ b/src/utils/oracle.rs @@ -1,19 +1,17 @@ use std::collections::BTreeMap; use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; use bytes::Bytes; +use tokio::sync::{Mutex, Notify}; /// In-memory oracle reader that returns exact fixture bytes. -/// -/// This is the canonical known-good reader for harnesses and integration -/// tests. #[derive(Clone)] pub struct Reader { data: Arc>, } impl Reader { - /// Creates a new oracle reader from deterministic fixture bytes. pub fn new(data: Bytes) -> Self { Self { data: Arc::new(data.to_vec()), @@ -38,27 +36,122 @@ impl crate::Reader for Reader { } } -/// In-memory oracle writer that stores exact extents by offset. -/// -/// This is the canonical known-good writer for harnesses and integration -/// tests. +/// In-memory oracle writer that stores exact cache payloads by key. 
#[derive(Default, Clone)] pub struct Writer { - extents: Arc>>, + entries: Arc>>, } impl crate::Writer for Writer { - async fn create_extent(&mut self, offset: usize, data: Bytes) -> std::io::Result<()> { - self.extents.lock().await.insert(offset, data); + async fn set_cache(&mut self, key: &str, value: &[u8]) -> std::io::Result<()> { + self.entries.lock().await.insert(key.to_owned(), Bytes::copy_from_slice(value)); Ok(()) } - async fn read_extent(&self, offset: usize) -> std::io::Result { - Ok(self.extents.lock().await.get(&offset).cloned().unwrap_or_else(Bytes::new)) + async fn get_cache(&self, key: &str) -> std::io::Result> { + Ok(self.entries.lock().await.get(key).cloned()) } - async fn delete_extent(&mut self, offset: usize) -> std::io::Result<()> { - self.extents.lock().await.remove(&offset); + async fn delete_cache(&mut self, key: &str) -> std::io::Result<()> { + self.entries.lock().await.remove(key); + Ok(()) + } +} + +/// One-shot gate for pausing the next metadata mutation. +#[derive(Default, Clone)] +pub struct MetadataMutationGate { + pending_pause: Arc, + blocked: Arc, + entered: Arc, + resume: Arc, +} + +impl MetadataMutationGate { + pub fn pause_next_mutation(&self) { + self.pending_pause.store(true, Ordering::SeqCst); + self.blocked.store(false, Ordering::SeqCst); + } + + pub async fn wait_until_blocked(&self) { + while !self.blocked.load(Ordering::SeqCst) { + self.entered.notified().await; + } + } + + pub fn resume(&self) { + self.blocked.store(false, Ordering::SeqCst); + self.resume.notify_waiters(); + } + + async fn pass_if_paused(&self) { + if self.pending_pause.swap(false, Ordering::SeqCst) { + let resume = self.resume.notified(); + self.blocked.store(true, Ordering::SeqCst); + self.entered.notify_waiters(); + resume.await; + } + } +} + +/// In-memory metadata oracle that only understands key/value storage. 
+#[derive(Clone, Default)] +pub struct Metadata { + entries: Arc>>, + mutation_gate: MetadataMutationGate, +} + +impl Metadata { + pub fn new() -> Self { + Self::default() + } + + pub fn mutation_gate(&self) -> MetadataMutationGate { + self.mutation_gate.clone() + } +} + +impl crate::metadata::Metadata for Metadata { + async fn set(&mut self, key: &str, value: V) -> std::io::Result<()> + where + V: serde::Serialize + Send, + { + self.mutation_gate.pass_if_paused().await; + self.entries + .lock() + .await + .insert(key.to_owned(), crate::common::codec::encode_value(&value)?); + Ok(()) + } + + async fn get(&self, key: &str) -> std::io::Result> + where + V: serde::de::DeserializeOwned + Send, + { + self.entries + .lock() + .await + .get(key) + .map(|value| crate::common::codec::decode_value(value)) + .transpose() + } + + async fn get_by_prefix(&self, prefix: &str) -> std::io::Result> + where + V: serde::de::DeserializeOwned + Send, + { + let mut entries = Vec::new(); + for (key, value) in self.entries.lock().await.iter() { + if key.starts_with(prefix) { + entries.push((key.clone(), crate::common::codec::decode_value(value)?)); + } + } + Ok(entries) + } + + async fn delete(&mut self, key: &str) -> std::io::Result<()> { + self.mutation_gate.pass_if_paused().await; + self.entries.lock().await.remove(key); Ok(()) } } diff --git a/src/writer.rs b/src/writer.rs index 72e6b1a..8b574ee 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -1,39 +1,34 @@ -use bytes::Bytes; - -/// Trait describing how SparseIO stores sparse extents. +/// Trait describing how SparseIO stores cache payload bytes. /// -/// Implementations must preserve exact byte fidelity for stored extents and +/// Implementations must preserve exact byte fidelity for stored payloads and /// expose stable, contract-level behavior: -/// - `read_extent()` returns the exact bytes previously written at that offset. -/// - missing extents are represented as empty bytes. -/// - deleting a missing extent is a no-op. 
-/// - writing the same offset twice is last-write-wins. +/// - `get_cache()` returns the exact bytes previously written at the same key. +/// - missing cache entries are represented as `None`. +/// - deleting a missing cache entry is a no-op. +/// - writing the same key twice is last-write-wins. /// /// Some examples include: -/// - A networked store that uses KV storage to store extents, allowing for distributed sparse objects. -/// - A file-backed store that uses an OS-provided sparse file . +/// - A networked store that uses KV storage to store cache payloads, allowing for distributed sparse objects. +/// - A file-backed store that uses a cache directory of content files. /// - A hybrid-store that caches hot areas of files to disk and less-frequently accessed portions to cheaper storage /// like S3. pub trait Writer: Send { - /// Creates or overwrites the extent at `offset` with `data`. + /// Creates or overwrites the payload stored at `key` with `value`. /// - /// The bytes written must be readable back verbatim via `read_extent()` - /// at the same offset. If the same offset is written more than once, the - /// most recent write wins. - fn create_extent( - &mut self, - offset: usize, - data: bytes::Bytes, - ) -> impl std::future::Future> + Send; + /// The bytes written must be readable back verbatim via `get_cache()` at + /// the same key. If the same key is written more than once, the most + /// recent write wins. + fn set_cache(&mut self, key: &str, value: &[u8]) + -> impl std::future::Future> + Send; - /// Reads the extent stored at `offset`. + /// Reads the payload stored at `key`. /// - /// If no extent exists at the requested offset, implementations must - /// return empty bytes rather than an error. - fn read_extent(&self, offset: usize) -> impl std::future::Future> + Send; + /// If no payload exists at the requested key, implementations must return + /// `None` rather than an error. 
+ fn get_cache(&self, key: &str) -> impl std::future::Future>> + Send; - /// Deletes the extent stored at `offset`. + /// Deletes the payload stored at `key`. /// - /// Deleting a missing extent must succeed without error. - fn delete_extent(&mut self, offset: usize) -> impl std::future::Future> + Send; + /// Deleting a missing cache payload must succeed without error. + fn delete_cache(&mut self, key: &str) -> impl std::future::Future> + Send; } diff --git a/tests/core.rs b/tests/core.rs index 3a6844f..8f79fe9 100644 --- a/tests/core.rs +++ b/tests/core.rs @@ -1,24 +1,26 @@ -#![cfg(feature = "utils")] +#![cfg(feature = "test-utils")] use std::sync::Arc; use bytes::Bytes; -use futures::StreamExt; use sparseio::Builder; -use sparseio::utils::{counting, fixture, flaky, oracle, tracing}; - -/// Builds a SparseIO instance for the integration tests using the supplied -/// reader, writer, and chunk size. -/// -/// Keeping this helper local avoids repeating builder setup across scenarios -/// while still making the configured contract explicit in one place. -async fn build_io(reader: R, writer: W, chunk_size: usize) -> sparseio::SparseIO +use sparseio::utils::{counting, fixture, oracle, tracing}; + +async fn build_io( + object_id: impl Into, + reader: R, + writer: W, + metadata: oracle::Metadata, + chunk_size: usize, +) -> sparseio::SparseIO where R: sparseio::Reader + Send + Sync + 'static, W: sparseio::Writer + Send + Sync + 'static, { Builder::new() + .object_id(object_id) .chunk_size(chunk_size) + .metadata(metadata) .reader(reader) .writer(writer) .build() @@ -26,17 +28,28 @@ where .expect("builder should succeed") } -/// This test fixes the contract around builder validation so downstream -/// harnesses can rely on explicit failures instead of panics or silent -/// defaults when required inputs are missing. 
#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn builder_validation_rejects_missing_fields_and_zero_chunk_size() { +async fn builder_validation_rejects_missing_fields() { tracing::init(); let reader = oracle::Reader::new(fixture::bytes(32)); let writer = oracle::Writer::default(); - let missing_reader = match Builder::::new() + let missing_object_id = match Builder::::new() + .metadata(oracle::Metadata::new()) + .reader(reader.clone()) + .writer(writer.clone()) + .build() + .await + { + Ok(_) => panic!("missing object id should fail"), + Err(err) => err, + }; + assert_eq!(missing_object_id.kind(), std::io::ErrorKind::InvalidInput); + + let missing_reader = match Builder::::new() + .object_id("test://missing-reader") + .metadata(oracle::Metadata::new()) .writer(writer.clone()) .build() .await @@ -46,7 +59,9 @@ async fn builder_validation_rejects_missing_fields_and_zero_chunk_size() { }; assert_eq!(missing_reader.kind(), std::io::ErrorKind::InvalidInput); - let missing_writer = match Builder::::new() + let missing_writer = match Builder::::new() + .object_id("test://missing-writer") + .metadata(oracle::Metadata::new()) .reader(reader.clone()) .build() .await @@ -56,47 +71,19 @@ async fn builder_validation_rejects_missing_fields_and_zero_chunk_size() { }; assert_eq!(missing_writer.kind(), std::io::ErrorKind::InvalidInput); - let zero_chunk = match Builder::new().chunk_size(0).reader(reader).writer(writer).build().await { - Ok(_) => panic!("zero chunk size should fail"), + let missing_metadata = match Builder::::new() + .object_id("test://missing-metadata") + .reader(reader) + .writer(writer) + .build() + .await + { + Ok(_) => panic!("missing metadata should fail"), Err(err) => err, }; - assert_eq!(zero_chunk.kind(), std::io::ErrorKind::InvalidInput); -} - -/// This test exercises the common read path at chunk-aligned and -/// unaligned offsets so SparseIO cannot regress to only handling exact -/// chunk boundaries. 
-#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn aligned_and_unaligned_viewer_reads_match_the_fixture() { - tracing::init(); - - let fixture = fixture::bytes(96); - let io = Arc::new( - build_io( - counting::Reader::new(oracle::Reader::new(fixture.clone())), - counting::Writer::new(oracle::Writer::default()), - 16, - ) - .await, - ); - - let mut viewer = io.viewer(); - viewer.seek(5).expect("seek should succeed"); - - let mut buf = vec![0u8; 37]; - let read = viewer.read(&mut buf).await.expect("read should succeed"); - assert_eq!(read, 37); - assert_eq!(buf, fixture.slice(5..42).to_vec()); - - viewer.seek(16).expect("aligned seek should succeed"); - let mut aligned = vec![0u8; 16]; - let read = viewer.read(&mut aligned).await.expect("aligned read should succeed"); - assert_eq!(read, 16); - assert_eq!(aligned, fixture.slice(16..32).to_vec()); + assert_eq!(missing_metadata.kind(), std::io::ErrorKind::InvalidInput); } -/// This test ensures the first miss materializes an extent and the second -/// read is served from the cache layer rather than re-fetching upstream. 
#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn cache_transitions_from_uncached_to_cached_without_extra_upstream_reads() { tracing::init(); @@ -104,152 +91,118 @@ async fn cache_transitions_from_uncached_to_cached_without_extra_upstream_reads( let fixture = fixture::bytes(64); let reader = counting::Reader::new(oracle::Reader::new(fixture.clone())); let writer = counting::Writer::new(oracle::Writer::default()); - let io = Arc::new(build_io(reader.clone(), writer.clone(), 16).await); + let io = Arc::new(build_io("test://cache-hit", reader.clone(), writer.clone(), oracle::Metadata::new(), 16).await); let first = io.read_chunk(0).await.expect("first read should succeed"); let second = io.read_chunk(0).await.expect("second read should succeed"); assert_eq!(first, second); assert_eq!(reader.read_count(), 1, "upstream reader should be used once"); - assert_eq!(writer.create_count(), 1, "the extent should be materialized once"); - assert_eq!(writer.read_count(), 1, "the cached re-read should come from the writer"); -} - -/// This test documents the intended `read_chunk` contract: callers may pass an -/// unaligned offset and receive the full chunk that contains that logical byte. -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn read_chunk_normalizes_to_the_containing_chunk() { - tracing::init(); - - let fixture = fixture::bytes(64); - let io = Arc::new( - build_io( - counting::Reader::new(oracle::Reader::new(fixture.clone())), - counting::Writer::new(oracle::Writer::default()), - 16, - ) - .await, - ); - - let chunk = io.read_chunk(17).await.expect("unaligned chunk read should succeed"); - assert_eq!(chunk, fixture.slice(16..32)); + assert_eq!(writer.create_count(), 1, "payload should be materialized once"); + assert_eq!(writer.read_count(), 1, "cached re-read should come from the writer"); } -/// This test protects the in-flight dedupe path so concurrent callers at -/// the same offset do not multiply upstream work. 
#[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn same_offset_concurrency_dedupe_shares_one_upstream_fetch() { tracing::init(); let fixture = fixture::bytes(128); - let reader = counting::Reader::new(oracle::Reader::new(fixture.clone())); - let writer = counting::Writer::new(oracle::Writer::default()); - let io = Arc::new(build_io(reader.clone(), writer, 32).await); + let reader = counting::Reader::new(oracle::Reader::new(fixture.clone())) + .with_read_delay(std::time::Duration::from_millis(10)); + let io = Arc::new( + build_io("test://dedupe", reader.clone(), oracle::Writer::default(), oracle::Metadata::new(), 32).await, + ); - let handles: Vec<_> = (0..12) + let tasks: Vec<_> = (0..8) .map(|_| { let io = io.clone(); tokio::spawn(async move { io.read_chunk(0).await }) }) .collect(); - for handle in handles { - let chunk = handle.await.expect("task should join").expect("chunk read should succeed"); + for task in tasks { + let chunk = task.await.expect("task should join").expect("chunk read should succeed"); assert_eq!(chunk, fixture.slice(0..32)); } - assert_eq!(reader.read_count(), 1, "same-offset concurrency should dedupe"); + assert_eq!(reader.read_count(), 1); } -/// This test verifies that dedupe does not leak across independent chunks -/// and that concurrent reads at different offsets still return the right -/// bytes. 
-#[tokio::test(flavor = "multi_thread", worker_threads = 4)] -async fn different_offset_concurrency_returns_correct_chunks() { +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn shared_chunks_survive_one_object_clear_until_last_reference_is_removed() { tracing::init(); - let fixture = fixture::bytes(192); - let reader = counting::Reader::new(oracle::Reader::new(fixture.clone())); - let writer = counting::Writer::new(oracle::Writer::default()); - let io = Arc::new(build_io(reader.clone(), writer, 32).await); + let data = fixture::bytes(32); + let metadata = oracle::Metadata::new(); - let offsets = [0usize, 32, 64, 96]; - let handles: Vec<_> = offsets - .into_iter() - .map(|offset| { - let io = io.clone(); - tokio::spawn(async move { (offset, io.read_chunk(offset).await) }) - }) - .collect(); + let first = Arc::new( + build_io("test://first", oracle::Reader::new(data.clone()), oracle::Writer::default(), metadata.clone(), 32) + .await, + ); + let second = Arc::new( + build_io("test://second", oracle::Reader::new(data.clone()), oracle::Writer::default(), metadata.clone(), 32) + .await, + ); - for handle in handles { - let (offset, chunk) = handle.await.expect("task should join"); - let chunk = chunk.expect("chunk read should succeed"); - let expected = fixture.slice(offset..offset + 32); - assert_eq!(chunk, expected, "chunk at offset {offset} should match the fixture"); - } + assert_eq!(first.read_chunk(0).await.expect("first read should work"), data); + assert_eq!(second.read_chunk(0).await.expect("second read should work"), data); - assert_eq!(reader.read_count(), offsets.len(), "different offsets should fetch independently"); -} + let mut first_viewer = first.viewer(); + first_viewer.clear_cache().await.expect("first clear should succeed"); -/// This test checks the stream path separately from buffered reads so the -/// byte stream remains a parity-preserving view over the same data. 
-#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn bytestream_matches_the_fixture_payload() { - tracing::init(); + assert_eq!( + second.read_chunk(0).await.expect("second object should still be readable"), + data, + "clearing one object should not remove shared payloads still needed by another object" + ); - let fixture = fixture::bytes(80); - let io = Arc::new( + let mut second_viewer = second.viewer(); + second_viewer.clear_cache().await.expect("second clear should succeed"); + + let reopened = Arc::new( build_io( - counting::Reader::new(oracle::Reader::new(fixture.clone())), - counting::Writer::new(oracle::Writer::default()), - 16, + "test://second", + counting::Reader::new(oracle::Reader::new(data.clone())), + oracle::Writer::default(), + metadata, + 32, ) .await, ); - - let mut viewer = io.viewer(); - viewer.seek(7).expect("seek should succeed"); - let mut stream = viewer.to_bytestream().await; - let mut collected = Vec::new(); - while let Some(chunk) = stream.next().await { - collected.extend_from_slice(&chunk.expect("stream chunk should succeed")); - } - - assert_eq!(Bytes::from(collected), fixture.slice(7..)); -} - -/// This test ensures a failing upstream read does not poison the in-flight -/// map, otherwise a transient reader error could permanently wedge the -/// chunk. 
-#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn transient_reader_failures_cleanup_flights_and_allow_retry() { - tracing::init(); - - let fixture = fixture::bytes(64); - let reader = counting::Reader::new(flaky::Reader::fail_once_at(fixture.clone(), [0])); - let writer = counting::Writer::new(oracle::Writer::default()); - let io = Arc::new(build_io(reader.clone(), writer, 16).await); - - assert!(io.read_chunk(0).await.is_err(), "first transient failure should surface"); - let chunk = io.read_chunk(0).await.expect("retry should succeed"); - assert_eq!(chunk, fixture.slice(0..16)); - assert_eq!(reader.read_count(), 2, "retry should re-enter the upstream reader after cleanup"); + assert_eq!( + reopened.read_chunk(0).await.expect("reopened object should still read"), + data, + "after clearing both objects, a reopened read should refetch from upstream instead of relying on stale metadata" + ); } -/// This test exercises the writer failure path so a failed materialization -/// can be retried instead of leaving a stale flight behind. 
#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn transient_writer_failures_cleanup_flights_and_allow_retry() { +async fn reopened_object_reuses_persisted_chunk_size_and_length() { tracing::init(); let fixture = fixture::bytes(64); - let reader = counting::Reader::new(oracle::Reader::new(fixture.clone())); - let writer = flaky::Writer::fail_once_at([0]); - let io = Arc::new(build_io(reader.clone(), writer, 16).await); - - assert!(io.read_chunk(0).await.is_err(), "first transient writer failure should surface"); - let chunk = io.read_chunk(0).await.expect("retry should succeed"); - assert_eq!(chunk, fixture.slice(0..16)); - assert_eq!(reader.read_count(), 2, "writer failure retry should refetch after cleanup"); + let metadata = oracle::Metadata::new(); + let first = Arc::new( + build_io( + "test://reopen", + oracle::Reader::new(fixture.clone()), + oracle::Writer::default(), + metadata.clone(), + 16, + ) + .await, + ); + assert_eq!(first.read_chunk(48).await.expect("tail should materialize"), fixture.slice(48..64)); + + let reopened = build_io( + "test://reopen", + oracle::Reader::new(Bytes::from_static(b"this length should be ignored")), + oracle::Writer::default(), + metadata, + 32, + ) + .await; + + assert_eq!(reopened.chunk_size(), 16); + assert_eq!(reopened.len(), 64); } diff --git a/tests/file.rs b/tests/file.rs index e5a4821..1fdcc8d 100644 --- a/tests/file.rs +++ b/tests/file.rs @@ -1,46 +1,43 @@ -#![cfg(feature = "file")] +#![cfg(all(feature = "impl-file", feature = "test-utils"))] -use std::collections::HashSet; use std::fs; use std::path::Path; use std::sync::Arc; +use std::time::Duration; -use bytes::Bytes; use sparseio::sources::file::{Reader, Writer}; -use sparseio::utils::{file, fixture, materialization, temp, tracing}; -use sparseio::{Builder, Reader as _, Writer as _}; +use sparseio::utils::{counting, fixture, temp, tracing}; +use sparseio::{Builder, Reader as _}; -/// Joins a temporary directory with a file name used by the 
file-backed tests. -fn temp_file(dir: &Path, name: &str) -> std::path::PathBuf { +fn temp_path(dir: &Path, name: &str) -> std::path::PathBuf { temp::temp_path(dir, name) } -/// Builds a file-backed SparseIO instance for the given source and -/// destination paths. async fn build_file_io( src_path: std::path::PathBuf, - dst_path: std::path::PathBuf, + cache_dir: std::path::PathBuf, + metadata: sparseio::utils::oracle::Metadata, chunk_size: usize, -) -> Arc> { +) -> Arc> { Arc::new( Builder::new() .chunk_size(chunk_size) + .object_id(format!("file://{}", src_path.display())) + .metadata(metadata) .reader(Reader::new(src_path)) - .writer(Writer::new(dst_path)) + .writer(Writer::new(cache_dir)) .build() .await .expect("file-backed SparseIO should build"), ) } -/// This test pins the file reader contract at the boundary conditions so -/// EOF behavior stays explicit rather than inferred from Tokio internals. #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn file_reader_truncates_tail_and_reports_eof() -> Result<(), Box> { tracing::init(); let dir = temp::temp_dir(); - let src_path = temp_file(dir.path(), "reader.bin"); + let src_path = temp_path(dir.path(), "reader.bin"); fs::write(&src_path, b"hello sparseio")?; let reader = Reader::new(&src_path); @@ -54,119 +51,143 @@ async fn file_reader_truncates_tail_and_reports_eof() -> Result<(), Box Result<(), Box> { +async fn local_file_reader_reuses_file_cache_metadata() -> Result<(), Box> { tracing::init(); let dir = temp::temp_dir(); - let dst_path = temp_file(dir.path(), "writer.bin"); - let mut writer = Writer::new(&dst_path); - - writer.delete_extent(0).await?; - writer.create_extent(64, Bytes::from_static(b"first")).await?; - writer.create_extent(64, Bytes::from_static(b"second")).await?; - writer.create_extent(256, Bytes::from_static(b"tail")).await?; + let src_path = temp_path(dir.path(), "reload-src.bin"); + let cache_dir = temp_path(dir.path(), "reload-cache"); + let fixture = fixture::bytes(64); 
+ let metadata = sparseio::utils::oracle::Metadata::new(); + fs::write(&src_path, &fixture)?; - assert_eq!(writer.read_extent(64).await?, Bytes::from_static(b"second")); - assert_eq!(writer.read_extent(128).await?, Bytes::new()); - assert_eq!(writer.read_extent(256).await?, Bytes::from_static(b"tail")); + let first = build_file_io(src_path.clone(), cache_dir.clone(), metadata.clone(), 16).await; + assert_eq!(first.read_chunk(0).await?, fixture.slice(0..16)); + assert_eq!(first.read_chunk(48).await?, fixture.slice(48..64)); - writer.delete_extent(64).await?; - assert!(writer.read_extent(64).await?.is_empty()); + let second = build_file_io(src_path, cache_dir, metadata, 32).await; + assert_eq!(second.chunk_size(), 16); + assert_eq!(second.read_chunk(0).await?, fixture.slice(0..16)); + assert_eq!(second.read_chunk(48).await?, fixture.slice(48..64)); Ok(()) } -/// This test mirrors the example's sparse materialization checks so the -/// example and test behavior stay aligned when materialization logic shifts. 
#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn partial_materialization_preserves_sparse_gaps() -> Result<(), Box> { +async fn file_writer_deduplicates_identical_chunks_across_identities() -> Result<(), Box> { tracing::init(); let dir = temp::temp_dir(); - let src_path = temp_file(dir.path(), "partial-src.bin"); - let dst_path = temp_file(dir.path(), "partial-dst.bin"); - let fixture = fixture::bytes(128); - fs::write(&src_path, &fixture)?; - - let io = build_file_io(src_path.clone(), dst_path.clone(), 16).await; - let mut viewer = io.viewer(); - let mut filled = HashSet::new(); - for offset in [0usize, 32, 96] { - viewer.seek(offset)?; - let mut buffer = vec![0u8; 16]; - let read = viewer.read(&mut buffer).await?; - assert_eq!(read, 16); - assert_eq!(buffer, fixture.slice(offset..offset + 16).to_vec()); - filled.insert(offset); - } + let cache_dir = temp_path(dir.path(), "shared-cache"); + let src_a = temp_path(dir.path(), "source-a.bin"); + let src_b = temp_path(dir.path(), "source-b.bin"); + let data = fixture::bytes(32); + let metadata = sparseio::utils::oracle::Metadata::new(); + fs::write(&src_a, &data)?; + fs::write(&src_b, &data)?; + + let first = build_file_io(src_a.clone(), cache_dir.clone(), metadata.clone(), 32).await; + assert_eq!(first.read_chunk(0).await?, data.slice(0..32)); + + let second = build_file_io(src_b.clone(), cache_dir.clone(), metadata, 32).await; + assert_eq!(second.read_chunk(0).await?, data.slice(0..32)); + + let names = fs::read_dir(&cache_dir)? 
+ .map(|entry| entry.map(|entry| entry.file_name().to_string_lossy().into_owned())) + .collect::, _>>()?; + assert_eq!(names.len(), 1); + + let mut first_viewer = first.viewer(); + first_viewer.clear_cache().await?; + + let remaining_after_first_clear = fs::read_dir(&cache_dir)?.count(); + assert_eq!( + remaining_after_first_clear, 1, + "shared chunk should remain while second identity still references it" + ); + + let mut second_viewer = second.viewer(); + second_viewer.clear_cache().await?; + let remaining_after_second_clear = fs::read_dir(&cache_dir)?.count(); + assert_eq!(remaining_after_second_clear, 0, "chunk should be deleted once the final reference is cleared"); - materialization::verify_partial_materialization(&src_path, &dst_path, &filled, 16, fixture.len())?; Ok(()) } -/// This test validates the fully materialized end state and ensures the -/// file-backed example path and file-backed integration tests share the -/// same observable contract. -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn full_materialization_matches_the_source_file() -> Result<(), Box> { +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn file_sparseio_repairs_missing_chunk_file_on_reopen() -> Result<(), Box> { tracing::init(); let dir = temp::temp_dir(); - let src_path = temp_file(dir.path(), "full-src.bin"); - let dst_path = temp_file(dir.path(), "full-dst.bin"); - let fixture = fixture::bytes(160); + let src_path = temp_path(dir.path(), "repair-src.bin"); + let cache_dir = temp_path(dir.path(), "repair-cache"); + let fixture = fixture::bytes(64); + let metadata = sparseio::utils::oracle::Metadata::new(); fs::write(&src_path, &fixture)?; - let io = build_file_io(src_path.clone(), dst_path.clone(), 32).await; - let mut viewer = io.viewer(); - for offset in (0..fixture.len()).step_by(32) { - viewer.seek(offset)?; - let mut buffer = vec![0u8; 32.min(fixture.len() - offset)]; - let read = viewer.read(&mut buffer).await?; - assert_eq!(read, 
buffer.len()); - assert_eq!(buffer, fixture.slice(offset..offset + buffer.len()).to_vec()); - } + let first = build_file_io(src_path.clone(), cache_dir.clone(), metadata.clone(), 16).await; + assert_eq!(first.read_chunk(0).await?, fixture.slice(0..16)); + + let chunk_path = fs::read_dir(&cache_dir)? + .next() + .expect("cache directory should contain one chunk")? + .path(); + fs::remove_file(&chunk_path)?; - materialization::verify_full_materialization(&src_path, &dst_path)?; + let reader = counting::Reader::new(Reader::new(src_path.clone())); + let reopened = Arc::new( + Builder::new() + .object_id(format!("file://{}", src_path.display())) + .chunk_size(32) + .metadata(metadata) + .reader(reader.clone()) + .writer(Writer::new(cache_dir.clone())) + .build() + .await?, + ); + + assert_eq!(reopened.chunk_size(), 16); + assert_eq!(reopened.read_chunk(0).await?, fixture.slice(0..16)); + assert_eq!(reader.read_count(), 1, "missing chunk file should trigger one upstream refetch"); + assert_eq!(fs::read_dir(&cache_dir)?.count(), 1, "repaired chunk should be recreated on disk"); Ok(()) } -/// This test checks the platform-specific sparse-file observation that the -/// example demonstrates, without making assumptions about internal extent -/// segmentation. 
-#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn allocated_size_stays_below_logical_size_on_supported_platforms() -> Result<(), Box> { +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn file_sparseio_dedupes_same_offset_concurrency_with_real_backend() -> Result<(), Box> { tracing::init(); let dir = temp::temp_dir(); - let src_path = temp_file(dir.path(), "sparse-src.bin"); - let dst_path = temp_file(dir.path(), "sparse-dst.bin"); - let fixture = fixture::bytes(4 * 1024); + let src_path = temp_path(dir.path(), "concurrency-src.bin"); + let cache_dir = temp_path(dir.path(), "concurrency-cache"); + let fixture = fixture::bytes(96); fs::write(&src_path, &fixture)?; - fs::OpenOptions::new() - .create(true) - .write(true) - .truncate(true) - .open(&dst_path)? - .set_len(fixture.len() as u64)?; - - let io = build_file_io(src_path.clone(), dst_path.clone(), 512).await; - let mut viewer = io.viewer(); - viewer.seek(0)?; - let mut buffer = vec![0u8; 512]; - viewer.read(&mut buffer).await?; - viewer.seek(fixture.len() - 512)?; - let mut tail = vec![0u8; 512]; - viewer.read(&mut tail).await?; - - let logical = fs::metadata(&dst_path)?.len(); - let allocated = file::allocated_bytes(&dst_path)?.unwrap_or(0); - assert_eq!(logical, fixture.len() as u64); - if allocated > 0 { - assert!(allocated <= logical, "allocated bytes should not exceed logical bytes for a sparse file"); + + let reader = counting::Reader::new(Reader::new(src_path)).with_read_delay(Duration::from_millis(10)); + let writer = counting::Writer::new(Writer::new(cache_dir.clone())); + let io = Arc::new( + Builder::new() + .chunk_size(16) + .object_id("file://concurrency-src") + .metadata(sparseio::utils::oracle::Metadata::new()) + .reader(reader.clone()) + .writer(writer.clone()) + .build() + .await?, + ); + + let tasks: Vec<_> = (0..8) + .map(|_| { + let io = io.clone(); + tokio::spawn(async move { io.read_chunk(0).await }) + }) + .collect(); + for task in tasks { + 
assert_eq!(task.await??, fixture.slice(0..16)); } + + assert_eq!(reader.read_count(), 1, "same-offset file reads should dedupe upstream work"); + assert_eq!(writer.create_count(), 1, "only one chunk should be materialized into the file cache"); + assert_eq!(fs::read_dir(&cache_dir)?.count(), 1, "only one on-disk chunk file should be created"); Ok(()) } diff --git a/tests/harness.rs b/tests/harness.rs index e1fdbf9..031bf6a 100644 --- a/tests/harness.rs +++ b/tests/harness.rs @@ -1,6 +1,8 @@ #![cfg(feature = "debug")] -use sparseio::debug::{ReaderHarness, ReaderHarnessConfig, WriterHarness, WriterHarnessConfig}; +use sparseio::debug::{ + MetadataHarness, MetadataHarnessConfig, ReaderHarness, ReaderHarnessConfig, WriterHarness, WriterHarnessConfig, +}; use sparseio::utils::{fixture, oracle, tracing}; /// This test proves the public reader harness can validate a correct @@ -40,3 +42,21 @@ async fn writer_harness_validates_oracle_writer_end_to_end() { harness.validate().await.expect("oracle writer should satisfy the harness"); } + +/// This test proves the public metadata harness can validate a correct +/// metadata store against direct load/replace/clear and refcount semantics. 
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn metadata_harness_validates_oracle_metadata_end_to_end() { + tracing::init(); + + let fixture = fixture::bytes(96); + let harness = MetadataHarness::new( + oracle::Metadata::new(), + MetadataHarnessConfig { + chunk_size: 16, + fixture, + }, + ); + + harness.validate().await.expect("oracle metadata should satisfy the harness"); +} diff --git a/tests/http.rs b/tests/http.rs deleted file mode 100644 index ca13102..0000000 --- a/tests/http.rs +++ /dev/null @@ -1,207 +0,0 @@ -#![cfg(feature = "http")] - -use ::mockito::Matcher; -use sparseio::Reader as _; -use sparseio::sources::http::Reader; -use sparseio::utils::tracing; - -/// This test covers the simplest discovery path so HTTP length probing -/// keeps using HEAD metadata when it is present and valid. -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn head_content_length_discovers_length() { - tracing::init(); - - let mut server = mockito::Server::new_async().await; - let _m = server - .mock("HEAD", "/asset") - .with_status(200) - .with_header("content-length", "12") - .create_async() - .await; - - let reader = Reader::with_client(reqwest::Client::new(), server.url() + "/asset"); - assert_eq!(reader.len().await.expect("len should succeed"), 12); -} - -/// This test covers the range-based fallback used when HEAD metadata is -/// incomplete but the server still honors ranged GET requests. 
-#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn content_range_fallback_discovers_length() { - tracing::init(); - - let mut server = mockito::Server::new_async().await; - let _head = server.mock("HEAD", "/asset").with_status(200).create_async().await; - let _range = server - .mock("GET", "/asset") - .match_header("range", "bytes=0-0") - .with_status(206) - .with_header("content-range", "bytes 0-0/99") - .with_body("a") - .create_async() - .await; - - let reader = Reader::with_client(reqwest::Client::new(), server.url() + "/asset"); - assert_eq!(reader.len().await.expect("len should succeed"), 99); -} - -/// This test covers servers that return a full 200 response for the range -/// probe and only expose the payload length through the body itself. -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn ok_body_length_fallback_discovers_length() { - tracing::init(); - - let mut server = mockito::Server::new_async().await; - let _head = server.mock("HEAD", "/asset").with_status(405).create_async().await; - let _range = server - .mock("GET", "/asset") - .match_header("range", "bytes=0-0") - .with_status(200) - .with_body("hello") - .create_async() - .await; - - let reader = Reader::with_client(reqwest::Client::new(), server.url() + "/asset"); - assert_eq!(reader.len().await.expect("len should succeed"), 5); -} - -/// This test pins the emitted Range header so the HTTP reader keeps asking -/// for the exact byte window the caller requested. 
-#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn range_header_shape_matches_offset_and_buffer_length() { - tracing::init(); - - let mut server = mockito::Server::new_async().await; - let _range = server - .mock("GET", "/asset") - .match_header("range", Matcher::Exact("bytes=4-7".to_string())) - .with_status(206) - .with_header("content-range", "bytes 4-7/8") - .with_body("efgh") - .create_async() - .await; - let reader = Reader::with_client(reqwest::Client::new(), server.url() + "/asset"); - - let mut buf = [0u8; 4]; - let read = reader.read_at(4, &mut buf).await.expect("read should succeed"); - assert_eq!(read, 4); - assert_eq!(&buf, b"efgh"); -} - -/// This test keeps the EOF mapping explicit for sparse readers that probe -/// past the end of the remote object. -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn range_not_satisfiable_maps_to_eof() { - tracing::init(); - - let mut server = mockito::Server::new_async().await; - let _range = server - .mock("GET", "/asset") - .match_header("range", "bytes=8-11") - .with_status(416) - .create_async() - .await; - let reader = Reader::with_client(reqwest::Client::new(), server.url() + "/asset"); - - let mut buf = [0u8; 4]; - let read = reader.read_at(8, &mut buf).await.expect("EOF should not fail"); - assert_eq!(read, 0); -} - -/// This test ensures callers can override unreliable length metadata -/// without consulting the network path at all. -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn len_override_takes_precedence_over_network_metadata() { - tracing::init(); - - let server = mockito::Server::new_async().await; - let reader = Reader::with_client(reqwest::Client::new(), server.url() + "/asset").with_len_override(1234); - assert_eq!(reader.len().await.expect("override should win"), 1234); -} - -/// This test keeps the parser strict so obviously malformed length headers -/// do not silently produce the wrong object size. 
-#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn malformed_headers_cause_length_discovery_to_fail() { - tracing::init(); - - let mut server = mockito::Server::new_async().await; - let _head = server - .mock("HEAD", "/asset") - .with_status(200) - .with_header("content-length", "not-a-number") - .create_async() - .await; - let _range = server - .mock("GET", "/asset") - .match_header("range", "bytes=0-0") - .with_status(206) - .with_header("content-range", "bytes 0-0/*") - .with_body("a") - .create_async() - .await; - - let reader = Reader::with_client(reqwest::Client::new(), server.url() + "/asset"); - assert!(reader.len().await.is_err(), "malformed headers should not be accepted"); -} - -/// This test covers the case where a server ignores a non-zero Range -/// request and returns an unbounded 200 response instead. -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn ignored_range_errors_at_non_zero_offsets_are_rejected() { - tracing::init(); - - let mut server = mockito::Server::new_async().await; - let _range = server - .mock("GET", "/asset") - .match_header("range", Matcher::Exact("bytes=5-8".to_string())) - .with_status(200) - .with_body("abcdefgh") - .create_async() - .await; - let reader = Reader::with_client(reqwest::Client::new(), server.url() + "/asset"); - - let mut buf = [0u8; 4]; - assert!(reader.read_at(5, &mut buf).await.is_err(), "non-zero offsets require a range-aware response"); -} - -/// This test rejects range responses that claim success for the wrong byte -/// window, which would otherwise corrupt the sparse cache with misaligned data. 
-#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn mismatched_content_range_is_rejected() { - tracing::init(); - - let mut server = mockito::Server::new_async().await; - let _range = server - .mock("GET", "/asset") - .match_header("range", Matcher::Exact("bytes=4-7".to_string())) - .with_status(206) - .with_header("content-range", "bytes 0-3/8") - .with_body("abcd") - .create_async() - .await; - let reader = Reader::with_client(reqwest::Client::new(), server.url() + "/asset"); - - let mut buf = [0u8; 4]; - assert!(reader.read_at(4, &mut buf).await.is_err(), "mismatched ranges must be rejected"); -} - -/// This test rejects truncated mid-object 206 responses so SparseIO never -/// caches a partial chunk unless the server proves it reached EOF. -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn truncated_partial_content_before_eof_is_rejected() { - tracing::init(); - - let mut server = mockito::Server::new_async().await; - let _range = server - .mock("GET", "/asset") - .match_header("range", Matcher::Exact("bytes=4-7".to_string())) - .with_status(206) - .with_header("content-range", "bytes 4-5/16") - .with_body("ef") - .create_async() - .await; - let reader = Reader::with_client(reqwest::Client::new(), server.url() + "/asset"); - - let mut buf = [0u8; 4]; - assert!(reader.read_at(4, &mut buf).await.is_err(), "short mid-object 206 responses must be rejected"); -} diff --git a/tests/opendal.rs b/tests/opendal.rs new file mode 100644 index 0000000..b0b1040 --- /dev/null +++ b/tests/opendal.rs @@ -0,0 +1,85 @@ +#![cfg(all(feature = "impl-opendal", feature = "test-utils"))] + +use std::sync::Arc; +use std::time::Duration; + +use bytes::Bytes; +use sparseio::sources::opendal::Reader; +use sparseio::utils::{counting, oracle}; +use sparseio::{Builder, Reader as _}; + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn opendal_reader_supports_ranged_reads_and_eof() -> Result<(), Box> { + let op = 
opendal::Operator::new(opendal::services::Memory::default())?.finish(); + op.write("object.bin", Bytes::from_static(b"hello sparseio")).await?; + + let reader = Reader::new(op, "object.bin").await?; + assert_eq!(reader.len().await?, 14); + + let mut range = [0xFFu8; 8]; + let read = reader.read_at(6, &mut range).await?; + assert_eq!(read, 8); + assert_eq!(&range, b"sparseio"); + + let mut tail = [0xAAu8; 8]; + let read = reader.read_at(11, &mut tail).await?; + assert_eq!(read, 3); + assert_eq!(&tail[..3], b"eio"); + assert_eq!(&tail[3..], &[0xAA; 5]); + + let eof = reader.read_at(14, &mut tail).await?; + assert_eq!(eof, 0); + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn opendal_reader_normalizes_paths() -> Result<(), Box> { + let op = opendal::Operator::new(opendal::services::Memory::default())?.finish(); + op.write("nested/object.bin", Bytes::from_static(b"normalize me")).await?; + + let reader = Reader::new(op, "./nested//object.bin").await?; + assert_eq!(reader.path(), "nested/object.bin"); + + let mut empty = []; + assert_eq!(reader.read_at(0, &mut empty).await?, 0, "zero-length reads should be a no-op"); + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn opendal_sparseio_dedupes_same_offset_reads_and_reuses_cache() -> Result<(), Box> { + let op = opendal::Operator::new(opendal::services::Memory::default())?.finish(); + let fixture: Vec = (0..96).map(|index| index as u8).collect(); + op.write("object.bin", Bytes::from(fixture.clone())).await?; + + let reader = counting::Reader::new(Reader::new(op, "object.bin").await?).with_read_delay(Duration::from_millis(10)); + let writer = counting::Writer::new(oracle::Writer::default()); + let io = Arc::new( + Builder::new() + .object_id("memory://nested/object.bin") + .chunk_size(16) + .metadata(oracle::Metadata::new()) + .reader(reader.clone()) + .writer(writer.clone()) + .build() + .await?, + ); + + let tasks: Vec<_> = (0..8) + .map(|_| { + let io = 
io.clone(); + tokio::spawn(async move { io.read_chunk(0).await }) + }) + .collect(); + for task in tasks { + assert_eq!(task.await??, Bytes::from(fixture[0..16].to_vec())); + } + + assert_eq!(reader.read_count(), 1, "same-offset OpenDAL reads should dedupe upstream work"); + assert_eq!(writer.create_count(), 1, "only one extent should be materialized after the miss"); + + let before_reads = reader.read_count(); + assert_eq!(io.read_chunk(0).await?, Bytes::from(fixture[0..16].to_vec())); + assert_eq!(reader.read_count(), before_reads, "cached OpenDAL chunk should avoid another upstream read"); + assert!(writer.read_count() >= 1, "cached replay should read from the writer path"); + Ok(()) +}