From af0a097921c5c6aa98e4bab3719ffce774241c31 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Thu, 23 Apr 2026 08:02:03 -0700 Subject: [PATCH 1/5] disable startFrom for OneLake endpoints --- src/azure/mod.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/azure/mod.rs b/src/azure/mod.rs index e6b9b9c2..3e4c005f 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -127,11 +127,20 @@ impl ObjectStore for MicrosoftAzure { prefix: Option<&Path>, offset: &Path, ) -> BoxStream<'static, Result> { - if self.client.config().is_emulator { - // Azurite doesn't support the startFrom query parameter, + let disable_start_from = self.client.config().is_emulator + || self + .client + .config() + .service + .host_str() + .is_some_and(|h| h.ends_with(".fabric.microsoft.com")); + + if disable_start_from { + // Azurite and OneLake don't support the startFrom query parameter, // fall back to client-side filtering // // See https://github.com/Azure/Azurite/issues/2619#issuecomment-3660701055 + // See https://github.com/apache/arrow-rs-object-store/issues/695 let offset = offset.clone(); self.list(prefix) .try_filter(move |f| futures_util::future::ready(f.location > offset)) From 49f1be74cd279e6491f363a9227f75837d877e08 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Thu, 23 Apr 2026 08:55:16 -0700 Subject: [PATCH 2/5] add integration test --- src/azure/mod.rs | 108 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/src/azure/mod.rs b/src/azure/mod.rs index 3e4c005f..f3967bc0 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -419,6 +419,114 @@ mod tests { assert_eq!(data, loaded); } + /// Verifies that `list_with_offset` works against OneLake (Fabric) endpoints. + /// + /// OneLake silently ignores the `startFrom` query parameter, so the client + /// must fall back to client-side filtering. + /// + /// Set these env vars before running: + /// - `AZURE_STORAGE_TOKEN`: bearer token (e.g. 
from `az account get-access-token`) + /// - `ONELAKE_URL`: full OneLake table URL, e.g. + /// `https://onelake.dfs.fabric.microsoft.com///Tables/` + /// + /// See + #[ignore = "Used for manual testing against a real OneLake endpoint."] + #[tokio::test] + async fn test_onelake_list_with_offset() { + let url = std::env::var("ONELAKE_URL").unwrap(); + let token = std::env::var("AZURE_STORAGE_TOKEN").unwrap(); + + let store = MicrosoftAzureBuilder::new() + .with_url(&url) + .with_config(AzureConfigKey::Token, token) + .build() + .unwrap(); + + // Derive a writable path prefix from the URL + // (skip workspace segment which becomes the container) + let parsed: Url = url.parse().unwrap(); + let mut segments = parsed.path_segments().unwrap(); + let _workspace = segments.next().unwrap(); + let base: String = segments.collect::>().join("/"); + let test_dir = format!("{base}/test_onelake_offset"); + + // Create test files with predictable ordering + let prefix = Path::from(test_dir.as_str()); + let files: Vec = (b'a'..=b'e') + .map(|c| Path::from(format!("{test_dir}/file_{}.txt", c as char))) + .collect(); + let data = Bytes::from("test data"); + for file in &files { + store.put(file, data.clone().into()).await.unwrap(); + } + + // Test 1: Offset at file_b → should return c, d, e (not b) + let offset = Path::from(format!("{test_dir}/file_b.txt")); + let result: Vec = store + .list_with_offset(Some(&prefix), &offset) + .map_ok(|m| m.location) + .try_collect() + .await + .unwrap(); + assert!( + !result.contains(&offset), + "offset file_b should be excluded, got: {result:?}" + ); + assert_eq!(result.len(), 3, "expected c/d/e after file_b, got: {result:?}"); + + // Test 2: Offset at file_a → should return b, c, d, e + let offset = Path::from(format!("{test_dir}/file_a.txt")); + let result: Vec = store + .list_with_offset(Some(&prefix), &offset) + .map_ok(|m| m.location) + .try_collect() + .await + .unwrap(); + assert!(!result.contains(&offset), "offset file_a should be 
excluded"); + assert_eq!(result.len(), 4, "expected b/c/d/e after file_a, got: {result:?}"); + + // Test 3: Offset at file_e (last) → should return empty + let offset = Path::from(format!("{test_dir}/file_e.txt")); + let result: Vec = store + .list_with_offset(Some(&prefix), &offset) + .map_ok(|m| m.location) + .try_collect() + .await + .unwrap(); + assert!(result.is_empty(), "offset at last file should return empty, got: {result:?}"); + + // Test 4: Offset before all files → should return all 5 + let offset = Path::from(format!("{test_dir}/file")); + let result: Vec = store + .list_with_offset(Some(&prefix), &offset) + .map_ok(|m| m.location) + .try_collect() + .await + .unwrap(); + assert_eq!(result.len(), 5, "offset before all files should return all, got: {result:?}"); + + // Test 5: Every returned entry is strictly greater than offset + let offset = Path::from(format!("{test_dir}/file_c.txt")); + let result: Vec = store + .list_with_offset(Some(&prefix), &offset) + .try_collect() + .await + .unwrap(); + for meta in &result { + assert!( + meta.location > offset, + "entry {} should be > offset {}", + meta.location, + offset + ); + } + + // Cleanup + for file in &files { + let _ = store.delete(file).await; + } + } + #[test] fn azure_test_config_get_value() { let azure_client_id = "object_store:fake_access_key_id".to_string(); From ec8493dd55652ff8f5607483ce5e78da24f92254 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Thu, 23 Apr 2026 09:08:31 -0700 Subject: [PATCH 3/5] improve docs --- src/azure/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/azure/mod.rs b/src/azure/mod.rs index f3967bc0..cd2b0019 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -426,8 +426,8 @@ mod tests { /// /// Set these env vars before running: /// - `AZURE_STORAGE_TOKEN`: bearer token (e.g. from `az account get-access-token`) - /// - `ONELAKE_URL`: full OneLake table URL, e.g. - /// `https://onelake.dfs.fabric.microsoft.com///Tables/
` + /// - `ONELAKE_URL`: full OneLake URL, e.g. + /// `https://onelake.blob.fabric.microsoft.com///Files/` /// /// See #[ignore = "Used for manual testing against a real OneLake endpoint."] From 53021bb63b5b84a471a730df41e6e9cd948ca1c8 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Thu, 23 Apr 2026 11:51:11 -0700 Subject: [PATCH 4/5] fmt --- src/azure/mod.rs | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/azure/mod.rs b/src/azure/mod.rs index cd2b0019..e5384ba7 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -472,7 +472,11 @@ mod tests { !result.contains(&offset), "offset file_b should be excluded, got: {result:?}" ); - assert_eq!(result.len(), 3, "expected c/d/e after file_b, got: {result:?}"); + assert_eq!( + result.len(), + 3, + "expected c/d/e after file_b, got: {result:?}" + ); // Test 2: Offset at file_a → should return b, c, d, e let offset = Path::from(format!("{test_dir}/file_a.txt")); @@ -482,8 +486,15 @@ mod tests { .try_collect() .await .unwrap(); - assert!(!result.contains(&offset), "offset file_a should be excluded"); - assert_eq!(result.len(), 4, "expected b/c/d/e after file_a, got: {result:?}"); + assert!( + !result.contains(&offset), + "offset file_a should be excluded" + ); + assert_eq!( + result.len(), + 4, + "expected b/c/d/e after file_a, got: {result:?}" + ); // Test 3: Offset at file_e (last) → should return empty let offset = Path::from(format!("{test_dir}/file_e.txt")); @@ -493,7 +504,10 @@ mod tests { .try_collect() .await .unwrap(); - assert!(result.is_empty(), "offset at last file should return empty, got: {result:?}"); + assert!( + result.is_empty(), + "offset at last file should return empty, got: {result:?}" + ); // Test 4: Offset before all files → should return all 5 let offset = Path::from(format!("{test_dir}/file")); @@ -503,7 +517,11 @@ mod tests { .try_collect() .await .unwrap(); - assert_eq!(result.len(), 5, "offset before all files should return all, got: 
{result:?}"); + assert_eq!( + result.len(), + 5, + "offset before all files should return all, got: {result:?}" + ); // Test 5: Every returned entry is strictly greater than offset let offset = Path::from(format!("{test_dir}/file_c.txt")); From c014f56c30099d611909acdc0dc0fcda7ea67368 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Thu, 23 Apr 2026 12:00:12 -0700 Subject: [PATCH 5/5] docs --- src/azure/mod.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/azure/mod.rs b/src/azure/mod.rs index e5384ba7..e059939f 100644 --- a/src/azure/mod.rs +++ b/src/azure/mod.rs @@ -421,13 +421,15 @@ mod tests { /// Verifies that `list_with_offset` works against OneLake (Fabric) endpoints. /// - /// OneLake silently ignores the `startFrom` query parameter, so the client - /// must fall back to client-side filtering. + /// OneLake does not honor the `startFrom` query parameter when using + /// friendly-name URLs (e.g. `.../MyWorkspace/lakehouse.Lakehouse/...`): + /// it reports no error and returns 200 OK with zero results. + /// GUID-based URLs handle `startFrom` correctly. /// /// Set these env vars before running: /// - `AZURE_STORAGE_TOKEN`: bearer token (e.g. from `az account get-access-token`) - /// - `ONELAKE_URL`: full OneLake URL, e.g. - /// `https://onelake.blob.fabric.microsoft.com///Files/` + /// - `ONELAKE_URL`: full OneLake URL with friendly names, e.g. + /// `https://onelake.blob.fabric.microsoft.com/{workspace}/{lakehouse}.Lakehouse/` /// /// See #[ignore = "Used for manual testing against a real OneLake endpoint."]