Skip to content

Commit d890694

Browse files
authored
Change string encoding to object (#1658)
* change string encoding to object * change object to np.object_, fix get_zarr_compression, sanitize_dtype, and no auto_chunk for object type * fix test_add_location_lat_lon_missing_all_NaN_errors * add test_add_depth_EK_with_platform_vertical_offsets * update test_nbytes * fix test_ek_depth_utils_group_variable_NaNs_logger_warnings
1 parent b34a810 commit d890694

4 files changed

Lines changed: 21 additions & 18 deletions

File tree

echopype/tests/consolidate/test_add_depth.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ def test_ek_depth_utils_group_variable_NaNs_logger_warnings(caplog, ek80_path):
248248
ds_Sv = ep.calibrate.compute_Sv(ed, waveform_mode="CW", encode_mode="power")
249249

250250
# Set first index of group variables to NaN
251-
ed["Platform"]["water_level"].values = np.nan # Is a scalar
251+
ed["Platform"]["water_level"][...] = np.nan # Is a scalar
252252
ed["Platform"]["vertical_offset"].values[0] = np.nan
253253
ed["Platform"]["transducer_offset_z"].values[0] = np.nan
254254
ed["Platform"]["pitch"].values[0] = np.nan
@@ -443,7 +443,8 @@ def test_add_depth_EK_with_platform_vertical_offsets(relpath, sonar_model, compu
443443
ds_Sv_with_depth["depth"].data,
444444
(ds_Sv["echo_range"] + transducer_depth).data,
445445
rtol=1e-10,
446-
atol=1e-10
446+
atol=1e-10,
447+
equal_nan=True
447448
)
448449

449450

echopype/tests/consolidate/test_add_location.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -328,16 +328,12 @@ def test_add_location_lat_lon_missing_all_NaN_errors(
328328
if error_type == "missing":
329329
ed["Platform"] = ed["Platform"].drop_vars(f"longitude_{datagram_type.lower()}")
330330
elif error_type == "all_nan":
331-
ed["Platform"][f"latitude_{datagram_type.lower()}"].data = (
332-
[np.nan] * len(ed["Platform"][f"latitude_{datagram_type.lower()}"])
333-
)
331+
ed["Platform"][f"latitude_{datagram_type.lower()}"].data[:] = np.nan
334332
else:
335333
if error_type == "missing":
336334
ed["Platform"] = ed["Platform"].drop_vars("longitude")
337335
if error_type == "all_nan":
338-
ed["Platform"]["latitude"].data = (
339-
[np.nan] * len(ed["Platform"]["latitude"])
340-
)
336+
ed["Platform"]["latitude"].data[:] = np.nan
341337

342338
# Check if the expected error is logged
343339
with pytest.raises(ValueError) as exc_info:

echopype/tests/echodata/test_echodata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ def test_group_paths(self, converted_zarr):
243243
def test_nbytes(self, converted_zarr):
244244
ed = self.create_ed(converted_zarr)
245245
assert isinstance(ed.nbytes, float)
246-
assert ed.nbytes == 4690060.0
246+
assert ed.nbytes == 4687964.0
247247

248248
def test_repr(self, converted_zarr):
249249
zarr_path_string = str(converted_zarr.absolute())

echopype/utils/coding.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"zarr": {
2020
"float": {"compressors": [BloscCodec(cname="zstd", clevel=3, shuffle="bitshuffle")]},
2121
"int": {"compressors": [BloscCodec(cname="lz4", clevel=5, shuffle="shuffle", blocksize=0)]},
22-
"string": {
22+
"object": {
2323
"compressors": [BloscCodec(cname="lz4", clevel=5, shuffle="shuffle", blocksize=0)]
2424
},
2525
"time": {
@@ -43,9 +43,9 @@
4343

4444

4545
EXPECTED_VAR_DTYPE = {
46-
"channel": np.str_,
47-
"cal_channel_id": np.str_,
48-
"beam": np.str_,
46+
"channel": np.object_,
47+
"cal_channel_id": np.object_,
48+
"beam": np.object_,
4949
"channel_mode": np.float32,
5050
"beam_stabilisation": np.byte,
5151
"non_quantitative_processing": np.int16,
@@ -63,8 +63,8 @@ def sanitize_dtypes(ds: xr.Dataset) -> xr.Dataset:
6363
if name in EXPECTED_VAR_DTYPE:
6464
expected_dtype = EXPECTED_VAR_DTYPE[name]
6565
elif np.issubdtype(var.dtype, np.object_):
66-
# Defaulting to strings dtype for object data types
67-
expected_dtype = np.str_
66+
# Defaulting to variable-length UTF-8 string (object) for object data types
67+
expected_dtype = np.object_
6868
else:
6969
# For everything else, this should be the same
7070
expected_dtype = var.dtype
@@ -119,7 +119,13 @@ def _get_dask_auto_chunk(
119119
tuple
120120
The chunks
121121
"""
122-
# Create a tuple filled with "auto" for each dimension in the variable's shape.
122+
# Create a tuple filled with "auto" for each dimension in the variable's shape
123+
# For object dtype (e.g., variable-length strings), Dask cannot auto-chunk
124+
if np.issubdtype(variable.dtype, np.object_):
125+
# Return a single chunk for each dimension (i.e., unchunked)
126+
return {dim: size for dim, size in variable.sizes.items()}
127+
128+
# Otherwise, use Dask's auto_chunks for numeric/fixed-size types
123129
auto_tuple = tuple("auto" for _ in variable.shape)
124130

125131
# Generate a tuple of chunk sizes using the dask 'auto_chunks' function.
@@ -162,8 +168,8 @@ def get_zarr_compression(var: xr.Variable, compression_settings: dict) -> dict:
162168
return compression_settings["float"]
163169
elif np.issubdtype(var.dtype, np.integer):
164170
return compression_settings["int"]
165-
elif np.issubdtype(var.dtype, np.str_):
166-
return compression_settings["string"]
171+
elif np.issubdtype(var.dtype, np.str_) or np.issubdtype(var.dtype, object):
172+
return compression_settings["object"]
167173
elif np.issubdtype(var.dtype, np.datetime64):
168174
return compression_settings["time"]
169175
else:

0 commit comments

Comments (0)