Skip to content

Commit d890694

Browse files
authored
Change string encoding to object (#1658)
* change string encoding to object * change object to np.object_, fix get_zarr_compression, sanitize_dtype, and no auto_chunk for object type * fix test_add_location_lat_lon_missing_all_NaN_errors * add test_add_depth_EK_with_platform_vertical_offsets * update test_nbytes * fix test_ek_depth_utils_group_variable_NaNs_logger_warnings
1 parent b34a810 commit d890694

4 files changed

Lines changed: 21 additions & 18 deletions

File tree

echopype/tests/consolidate/test_add_depth.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ def test_ek_depth_utils_group_variable_NaNs_logger_warnings(caplog, ek80_path):
248248
ds_Sv = ep.calibrate.compute_Sv(ed, waveform_mode="CW", encode_mode="power")
249249

250250
# Set first index of group variables to NaN
251-
ed["Platform"]["water_level"].values = np.nan # Is a scalar
251+
ed["Platform"]["water_level"][...] = np.nan # Is a scalar
252252
ed["Platform"]["vertical_offset"].values[0] = np.nan
253253
ed["Platform"]["transducer_offset_z"].values[0] = np.nan
254254
ed["Platform"]["pitch"].values[0] = np.nan
@@ -443,7 +443,8 @@ def test_add_depth_EK_with_platform_vertical_offsets(relpath, sonar_model, compu
443443
ds_Sv_with_depth["depth"].data,
444444
(ds_Sv["echo_range"] + transducer_depth).data,
445445
rtol=1e-10,
446-
atol=1e-10
446+
atol=1e-10,
447+
equal_nan=True
447448
)
448449

449450

echopype/tests/consolidate/test_add_location.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -328,16 +328,12 @@ def test_add_location_lat_lon_missing_all_NaN_errors(
328328
if error_type == "missing":
329329
ed["Platform"] = ed["Platform"].drop_vars(f"longitude_{datagram_type.lower()}")
330330
elif error_type == "all_nan":
331-
ed["Platform"][f"latitude_{datagram_type.lower()}"].data = (
332-
[np.nan] * len(ed["Platform"][f"latitude_{datagram_type.lower()}"])
333-
)
331+
ed["Platform"][f"latitude_{datagram_type.lower()}"].data[:] = np.nan
334332
else:
335333
if error_type == "missing":
336334
ed["Platform"] = ed["Platform"].drop_vars("longitude")
337335
if error_type == "all_nan":
338-
ed["Platform"]["latitude"].data = (
339-
[np.nan] * len(ed["Platform"]["latitude"])
340-
)
336+
ed["Platform"]["latitude"].data[:] = np.nan
341337

342338
# Check if the expected error is logged
343339
with pytest.raises(ValueError) as exc_info:

echopype/tests/echodata/test_echodata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ def test_group_paths(self, converted_zarr):
243243
def test_nbytes(self, converted_zarr):
244244
ed = self.create_ed(converted_zarr)
245245
assert isinstance(ed.nbytes, float)
246-
assert ed.nbytes == 4690060.0
246+
assert ed.nbytes == 4687964.0
247247

248248
def test_repr(self, converted_zarr):
249249
zarr_path_string = str(converted_zarr.absolute())

echopype/utils/coding.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"zarr": {
2020
"float": {"compressors": [BloscCodec(cname="zstd", clevel=3, shuffle="bitshuffle")]},
2121
"int": {"compressors": [BloscCodec(cname="lz4", clevel=5, shuffle="shuffle", blocksize=0)]},
22-
"string": {
22+
"object": {
2323
"compressors": [BloscCodec(cname="lz4", clevel=5, shuffle="shuffle", blocksize=0)]
2424
},
2525
"time": {
@@ -43,9 +43,9 @@
4343

4444

4545
EXPECTED_VAR_DTYPE = {
46-
"channel": np.str_,
47-
"cal_channel_id": np.str_,
48-
"beam": np.str_,
46+
"channel": np.object_,
47+
"cal_channel_id": np.object_,
48+
"beam": np.object_,
4949
"channel_mode": np.float32,
5050
"beam_stabilisation": np.byte,
5151
"non_quantitative_processing": np.int16,
@@ -63,8 +63,8 @@ def sanitize_dtypes(ds: xr.Dataset) -> xr.Dataset:
6363
if name in EXPECTED_VAR_DTYPE:
6464
expected_dtype = EXPECTED_VAR_DTYPE[name]
6565
elif np.issubdtype(var.dtype, np.object_):
66-
# Defaulting to strings dtype for object data types
67-
expected_dtype = np.str_
66+
# Defaulting to variable-length UTF-8 string (object) for object data types
67+
expected_dtype = np.object_
6868
else:
6969
# For everything else, this should be the same
7070
expected_dtype = var.dtype
@@ -119,7 +119,13 @@ def _get_dask_auto_chunk(
119119
tuple
120120
The chunks
121121
"""
122-
# Create a tuple filled with "auto" for each dimension in the variable's shape.
122+
# Create a tuple filled with "auto" for each dimension in the variable's shape
123+
# For object dtype (e.g., variable-length strings), Dask cannot auto-chunk
124+
if np.issubdtype(variable.dtype, np.object_):
125+
# Return a single chunk for each dimension (i.e., unchunked)
126+
return {dim: size for dim, size in variable.sizes.items()}
127+
128+
# Otherwise, use Dask's auto_chunks for numeric/fixed-size types
123129
auto_tuple = tuple("auto" for _ in variable.shape)
124130

125131
# Generate a tuple of chunk sizes using the dask 'auto_chunks' function.
@@ -162,8 +168,8 @@ def get_zarr_compression(var: xr.Variable, compression_settings: dict) -> dict:
162168
return compression_settings["float"]
163169
elif np.issubdtype(var.dtype, np.integer):
164170
return compression_settings["int"]
165-
elif np.issubdtype(var.dtype, np.str_):
166-
return compression_settings["string"]
171+
elif np.issubdtype(var.dtype, np.str_) or np.issubdtype(var.dtype, object):
172+
return compression_settings["object"]
167173
elif np.issubdtype(var.dtype, np.datetime64):
168174
return compression_settings["time"]
169175
else:

0 commit comments

Comments (0)