From b0a8b5c7e90b266cb33a466a2992798abd1a700d Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 3 May 2026 14:04:54 -0500 Subject: [PATCH 1/3] Fix get_nearest_continuous: accept scalar targets and missing time column MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docstring says ``targets`` accepts "anything ``pandas.to_datetime`` consumes", which includes a bare string or ``pd.Timestamp``. But ``pd.to_datetime("2024-01-01T00:00:00Z", utc=True)`` returns a scalar ``Timestamp``, and ``pd.DatetimeIndex(scalar)`` raises ``TypeError`` — so single-value cases crashed despite the documented contract. Wrap a scalar result in a one-element ``DatetimeIndex`` so any ``pandas.to_datetime``-consumable input works. Also: when the user passes ``properties`` that excludes ``time``, the helper used to crash with ``KeyError`` deep inside ``df.assign``. Detect the missing column up front and raise a ``ValueError`` pointing at the likely cause. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/nearest.py | 20 +++++++++++- tests/waterdata_nearest_test.py | 51 ++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/dataretrieval/waterdata/nearest.py b/dataretrieval/waterdata/nearest.py index 68f123ef..e2edea7a 100644 --- a/dataretrieval/waterdata/nearest.py +++ b/dataretrieval/waterdata/nearest.py @@ -139,7 +139,7 @@ def get_nearest_continuous( ... ) """ _check_nearest_kwargs(kwargs, on_tie) - target_index = pd.DatetimeIndex(pd.to_datetime(targets, utc=True)) + target_index = _coerce_targets(targets) window_td = pd.Timedelta(window) if len(target_index) == 0: @@ -153,6 +153,11 @@ def get_nearest_continuous( filter_lang="cql-text", **kwargs, ) + if "time" not in df.columns: + raise ValueError( + "get_nearest_continuous requires a 'time' column in the response; " + "if a `properties` kwarg was passed, include 'time' in it" + ) if df.empty: return _empty_nearest_result(df), md @@ -174,6 +179,19 @@ def get_nearest_continuous( return pd.DataFrame(selected).reset_index(drop=True), md +def _coerce_targets(targets: Any) -> pd.DatetimeIndex: + """Accept anything ``pandas.to_datetime`` consumes, including a single value. + + A bare scalar (string, ``Timestamp``, ``datetime``, …) becomes a + one-element ``DatetimeIndex``; an iterable round-trips through + ``pd.to_datetime`` directly. + """ + parsed = pd.to_datetime(targets, utc=True) + if isinstance(parsed, pd.DatetimeIndex): + return parsed + return pd.DatetimeIndex([parsed]) + + def _check_nearest_kwargs(kwargs: dict[str, Any], on_tie: OnTie) -> None: """Reject kwargs the helper owns; validate ``on_tie``.""" for forbidden in ("time", "filter", "filter_lang"): diff --git a/tests/waterdata_nearest_test.py b/tests/waterdata_nearest_test.py index 4dc0ab9d..3f988a6b 100644 --- a/tests/waterdata_nearest_test.py +++ b/tests/waterdata_nearest_test.py @@ -265,3 +265,54 @@ def test_forwards_kwargs_to_get_continuous(patch_get_continuous): _, kwargs = patch_get_continuous.call_args assert kwargs["statistic_id"] == "00011" assert kwargs["approval_status"] == "Approved" + + +def test_accepts_single_string_target(patch_get_continuous): + """A bare scalar target must round-trip through pd.to_datetime. + + Regression: previously `pd.DatetimeIndex(pd.to_datetime("...", utc=True))` + raised TypeError because pd.to_datetime returns a scalar Timestamp for a + single-string input. + """ + patch_get_continuous.return_value = ( + _fake_df([{"time": "2023-06-15T10:30:00Z", "value": 22.4}]), + mock.Mock(), + ) + result, _ = get_nearest_continuous( + "2023-06-15T10:30:31Z", monitoring_location_id="USGS-02238500" + ) + assert len(result) == 1 + assert result["target_time"].iloc[0] == pd.Timestamp( + "2023-06-15T10:30:31Z", tz="UTC" + ) + + +def test_accepts_single_timestamp_target(patch_get_continuous): + """A single ``pd.Timestamp`` target also round-trips.""" + patch_get_continuous.return_value = ( + _fake_df([{"time": "2023-06-15T10:30:00Z", "value": 22.4}]), + mock.Mock(), + ) + target = pd.Timestamp("2023-06-15T10:30:31Z", tz="UTC") + result, _ = get_nearest_continuous(target, monitoring_location_id="USGS-02238500") + assert len(result) == 1 + + +def test_missing_time_column_raises_helpful_error(patch_get_continuous): + """If the response has no 'time' column (e.g. user passed `properties` + that excluded it), raise ValueError instead of crashing with KeyError. + """ + df_no_time = pd.DataFrame( + { + "value": [22.4], + "monitoring_location_id": ["USGS-02238500"], + } + ) + patch_get_continuous.return_value = (df_no_time, mock.Mock()) + + with pytest.raises(ValueError, match="'time' column"): + get_nearest_continuous( + ["2023-06-15T10:30:31Z"], + monitoring_location_id="USGS-02238500", + properties=["value", "monitoring_location_id"], + ) From 2df9743222ad3c92d1b9217010285a67eef313f7 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 4 May 2026 10:09:57 -0500 Subject: [PATCH 2/3] Preserve list-like target inputs and avoid double-tz in test Per copilot review on PR #251: - _coerce_targets: detect non-DatetimeIndex iterables (Series, ndarray) via pd.api.types.is_scalar so the elements are preserved instead of being wrapped in a single-element list. Add a regression test passing a pd.Series of two timestamps and assert both are processed. - Tests: drop the redundant tz='UTC' on pd.Timestamp inputs that already carry a Z suffix; pandas 2.x raises on double timezone specification. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/nearest.py | 8 +++++--- tests/waterdata_nearest_test.py | 22 ++++++++++++++++++---- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/dataretrieval/waterdata/nearest.py b/dataretrieval/waterdata/nearest.py index e2edea7a..ecb70c91 100644 --- a/dataretrieval/waterdata/nearest.py +++ b/dataretrieval/waterdata/nearest.py @@ -183,13 +183,15 @@ def _coerce_targets(targets: Any) -> pd.DatetimeIndex: """Accept anything ``pandas.to_datetime`` consumes, including a single value. A bare scalar (string, ``Timestamp``, ``datetime``, …) becomes a - one-element ``DatetimeIndex``; an iterable round-trips through - ``pd.to_datetime`` directly. + one-element ``DatetimeIndex``; an iterable (list, ``Series``, ``ndarray``) + is wrapped directly so its elements are preserved. """ parsed = pd.to_datetime(targets, utc=True) if isinstance(parsed, pd.DatetimeIndex): return parsed - return pd.DatetimeIndex([parsed]) + if pd.api.types.is_scalar(parsed): + return pd.DatetimeIndex([parsed]) + return pd.DatetimeIndex(parsed) def _check_nearest_kwargs(kwargs: dict[str, Any], on_tie: OnTie) -> None: diff --git a/tests/waterdata_nearest_test.py b/tests/waterdata_nearest_test.py index 3f988a6b..64deeccd 100644 --- a/tests/waterdata_nearest_test.py +++ b/tests/waterdata_nearest_test.py @@ -282,9 +282,7 @@ def test_accepts_single_string_target(patch_get_continuous): "2023-06-15T10:30:31Z", monitoring_location_id="USGS-02238500" ) assert len(result) == 1 - assert result["target_time"].iloc[0] == pd.Timestamp( - "2023-06-15T10:30:31Z", tz="UTC" - ) + assert result["target_time"].iloc[0] == pd.Timestamp("2023-06-15T10:30:31Z") def test_accepts_single_timestamp_target(patch_get_continuous): @@ -293,11 +291,27 @@ def test_accepts_single_timestamp_target(patch_get_continuous): _fake_df([{"time": "2023-06-15T10:30:00Z", "value": 22.4}]), mock.Mock(), ) - target = pd.Timestamp("2023-06-15T10:30:31Z", tz="UTC") + target = pd.Timestamp("2023-06-15T10:30:31Z") result, _ = get_nearest_continuous(target, monitoring_location_id="USGS-02238500") assert len(result) == 1 +def test_accepts_pandas_series_targets(patch_get_continuous): + """A ``pd.Series`` of timestamps preserves all elements (not just the first).""" + patch_get_continuous.return_value = ( + _fake_df( + [ + {"time": "2023-06-15T10:30:00Z", "value": 22.4}, + {"time": "2023-06-16T10:30:00Z", "value": 22.5}, + ] + ), + mock.Mock(), + ) + targets = pd.Series(["2023-06-15T10:30:31Z", "2023-06-16T10:30:31Z"]) + result, _ = get_nearest_continuous(targets, monitoring_location_id="USGS-02238500") + assert len(result) == 2 + + def test_missing_time_column_raises_helpful_error(patch_get_continuous): """If the response has no 'time' column (e.g. user passed `properties` that excluded it), raise ValueError instead of crashing with KeyError. From 001b051accd289858dd2a4c4f987f12fc5ace7d6 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 25 Jun 2026 09:21:59 -0500 Subject: [PATCH 3/3] refactor(waterdata): collapse _coerce_targets to a single scalar guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `isinstance(parsed, pd.DatetimeIndex)` early-return was redundant: `pd.DatetimeIndex(parsed)` returns an equivalent index for an already- parsed DatetimeIndex, so the three branches reduce to one scalar guard that wraps a bare value in a list before constructing the index. Behavior is unchanged for every input kind (DatetimeIndex, scalar string/Timestamp, list, Series, ndarray, empty) — the existing nearest tests cover them all. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01Sjb14HkwuCydKSKMsaXsgd --- dataretrieval/waterdata/nearest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dataretrieval/waterdata/nearest.py b/dataretrieval/waterdata/nearest.py index ecb70c91..edf1a912 100644 --- a/dataretrieval/waterdata/nearest.py +++ b/dataretrieval/waterdata/nearest.py @@ -187,10 +187,8 @@ def _coerce_targets(targets: Any) -> pd.DatetimeIndex: is wrapped directly so its elements are preserved. """ parsed = pd.to_datetime(targets, utc=True) - if isinstance(parsed, pd.DatetimeIndex): - return parsed if pd.api.types.is_scalar(parsed): - return pd.DatetimeIndex([parsed]) + parsed = [parsed] return pd.DatetimeIndex(parsed)