From 8582979f242da1ee89baef0b88f07ecab52574a3 Mon Sep 17 00:00:00 2001
From: adkinsrs <sadkins@som.umaryland.edu>
Date: Wed, 10 Jun 2026 15:54:34 -0400
Subject: [PATCH 1/2] (WIP) Enhance CosMxHandler to process CosMx spatial data
 tarballs, including file extraction and gene ID updates; update properties
 for region ID and name.

---
 lib/gear/spatialhandler.py | 113 +++++++++++++++++++++++++++++++++----
 1 file changed, 103 insertions(+), 10 deletions(-)

diff --git a/lib/gear/spatialhandler.py b/lib/gear/spatialhandler.py
index 5b903862..1bb8465d 100644
--- a/lib/gear/spatialhandler.py
+++ b/lib/gear/spatialhandler.py
@@ -651,6 +651,7 @@ class CosMxHandler(SpatialHandler):
     * `<dataset_id>_`'exprMat_file.csv'`: Counts matrix.
     * `<dataset_id>_`'metadata_file.csv'`: Metadata file.
     * `<dataset_id>_`'fov_positions_file.csv'`: Field of view file.
+    * (Optional) `<dataset_id>_`'tx_file.csv'`: Transcripts file
     * 'CellComposite': Directory containing the images.
     * 'CellLabels': Directory containing the labels.
     """
@@ -663,17 +664,17 @@ def has_images(self) -> bool:
     @property
     def coordinate_system(self) -> str:
         """Returns the coordinate system used by CosMx datasets."""
-        return "global"
+        return "global" # may also be "spatial"
 
     @property
     def region_id(self) -> str:
         """Returns the region ID used for spot data."""
-        return "instance_id"
+        return "cell_ID"
 
     @property
     def region_name(self) -> str:
         """Returns the name of the region used for spot data."""
-        return "locations"
+        return "fov_labels"
 
     @property
     def platform(self) -> str:
@@ -687,11 +688,87 @@ def img_name(self) -> str | None:
 
     def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
         """
-        Reads and processes a CosMx spatial data file from the given filepath.
-        For CosMx, this is a stub and does not perform any operation.
+        Reads and processes a CosMx spatial data tarball from the given filepath.
+        Extracts required files, checks that the clustering file exists, loads the data into a SpatialData object, and updates gene IDs.
         """
-        return self
+        extract_dir = kwargs.get("extract_dir", '/tmp/')
+        extract_dir = os.path.join(extract_dir, 'files')
+
+        if filepath.endswith(".tar.gz"):
+            mode = "r:gz"  # Read as gzipped tar file
+        elif filepath.endswith(".tar"):
+            mode = "r"     # Read as plain tar file
+        else:
+            raise Exception("File must be a .tar or .tar.gz file.")
+
+        _remove_dir(extract_dir)
+
+        transcripts_present = False
+
+        with tarfile.open(filepath, mode) as tf:
+            for entry in tf:
+                # Skip any BSD tar artifacts, like files that start with ._ or .DS_Store
+                if ".DS_Store" in entry.name or "._" in entry.name:
+                    continue
+
+                # IF file is gzipped, gunzip it
+                if entry.name.endswith(".gz"):
+                    entry_io = tf.extractfile(entry)
+                    if entry_io is None:
+                        raise Exception("Error occurred while extracting file: ", entry.name)
+                    with entry_io as f:
+                        with open(os.path.join(extract_dir, entry.name[:-3]), "wb") as out_f:
+                            out_f.write(f.read())
+                    entry.name = entry.name[:-3]    # Adjust file name
+
+                if entry.name.endswith("tx_file.csv"):
+                    transcripts_present = True
+
+                # Extract file into tmp dir
+                filepath = "{0}/{1}".format(extract_dir, entry.name)
+                tf.extract(entry, path=extract_dir)
+
+        # If clustering file does not exist, raise an exception
+        clustering_csv_path = "{}/clusters.csv".format(extract_dir)
+        if not os.path.exists(clustering_csv_path):
+            raise Exception("clusters.csv file not found in tarball.")
+
+        # If clustering file does not have "Barcode" and "Cluster" columns, raise an exception
+        #with open(clustering_csv_path, 'r') as f:
+        #    first_line = f.readline()
+        #    if "Barcode" not in first_line or "Cluster" not in first_line:
+        #        raise Exception("clusters.csv file does not have 'Barcode' and 'Cluster' columns in clusters.csv file in tarball.")
+
+        try:
+            sdata = sdio.cosmx(extract_dir
+                                    , dataset_id="spatialdata"   # Provide a name to standarize downstream usage
+                                    , transcripts=transcripts_present
+                                    )
+        except Exception:
+            raise
+
+        # add clustering information to the vis_sdata.table.obs dataframe
+        #clustering = pd.read_csv(clustering_csv_path)
+        # make barcode as index
+        #clustering = clustering.set_index('Barcode')
+        #sdata.tables[self.NORMALIZED_TABLE_NAME].obs['clusters'] = clustering['Cluster'].astype('category')
+        # If all clusters are missing, raise an exception
+        #if sdata.tables[self.NORMALIZED_TABLE_NAME].obs['clusters'].isna().all():
+        #    raise Exception("All cluster values are missing in clusters.csv file in tarball.")
 
+        # The CosMx expression matrix has the gene names as the index, need to move them to a column and set the index to the ensembl id
+        sdata.tables[self.NORMALIZED_TABLE_NAME].var_names_make_unique()
+
+        # currently gene symbols are the index, need to move them to a column
+        sdata.tables[self.NORMALIZED_TABLE_NAME].var["gene_symbol"] = sdata.tables[self.NORMALIZED_TABLE_NAME].var.index
+
+        # set the index to the ensembl id (gene_ids)
+        sdata.tables[self.NORMALIZED_TABLE_NAME].var = sdata.tables[self.NORMALIZED_TABLE_NAME].var.set_index("gene_ids")
+
+        self.sdata = sdata
+        self.standardize_sdata()
+        self.originalFile = filepath
+        return self
 
 class CurioHandler(SpatialHandler):
     """
@@ -810,7 +887,10 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
         var_features_moransi.to_csv(spatial_moransi_file, sep="\t", header=True, index=True, index_label=False)
 
         # Now are ready to read in to a SpatialData object
-        sdata = sdio.curio(extract_dir)
+        try:
+            sdata = sdio.curio(extract_dir)
+        except Exception:
+            raise
 
         # To get the adata equivalent, look at sdata.tables["table"]
 
@@ -1073,7 +1153,10 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
             if "Barcode" not in first_line or "Cluster" not in first_line:
                 raise Exception("clusters.csv file does not have 'Barcode' and 'Cluster' columns in clusters.csv file in tarball.")
 
-        sdata = sdio.visium(path=extract_dir, dataset_id="spatialdata")    # Provide a name to standarize downstream usage
+        try:
+            sdata = sdio.visium(path=extract_dir, dataset_id="spatialdata")    # Provide a name to standarize downstream usage
+        except Exception:
+            raise
 
         # add clustering information to the vis_sdata.table.obs dataframe
         clustering = pd.read_csv(clustering_csv_path)
@@ -1151,6 +1234,10 @@ def img_name(self) -> str | None:
         return "spatialdata_hires_image"
 
     def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
+        """
+        Reads and processes a Visium HD spatial data tarball from the given filepath.
+        Extracts required files, loads clustering and spatial data, updates gene IDs, and loads into a SpatialData object.
+        """
         extract_dir = kwargs.get("extract_dir", '/tmp/')
         extract_dir = os.path.join(extract_dir, 'files')
 
@@ -1203,7 +1290,8 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
         if not os.path.exists("{}/spatialdata_feature_slice.h5".format(binned_outputs_dir)):
             os.symlink("{}/feature_slice.h5".format(absolute_path), "{}/spatialdata_feature_slice.h5".format(binned_outputs_dir))
 
-        sdata = sdio.visium_hd(binned_outputs_dir
+        try:
+            sdata = sdio.visium_hd(binned_outputs_dir
                                 , dataset_id="spatialdata"   # Provide a name to standarize downstream usage
                                 , bin_size=8
                                 , filtered_counts_file=True
@@ -1211,6 +1299,8 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
                                 , fullres_image_file=None
                                 , bins_as_squares=True
                                 )
+        except Exception:
+            raise
 
         # add clustering information to the vis_sdata.table.obs dataframe
         clustering = pd.read_csv(clustering_csv_path)
@@ -1340,7 +1430,8 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
             if "Barcode" not in first_line or "Cluster" not in first_line:
                 raise Exception("clusters.csv file does not have 'Barcode' and 'Cluster' columns in clusters.csv file in tarball.")
 
-        sdata = sdio.xenium(extract_dir
+        try:
+            sdata = sdio.xenium(extract_dir
                             , cells_labels=False # Avoid adding polygons to SpatialData object (for now due to out-of-memory issues)
                             , nucleus_labels=False
                             , cell_boundaries=cell_boundaries_present
@@ -1349,6 +1440,8 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
                             , cells_as_circles=True  # Table is associated with the cells instead of the nuclei (faster performance)
                             , morphology_mip=False   # Using the morphology_focus image instead
                             )
+        except Exception:
+            raise
 
         # In code, it seems that the Xenium reader is supposed to set the index to the "barcodes" column
         # But this column is not found, so we need to manually replace with "cell_id"

From da9e0555e8cc123d131f0205a7fa019a567764ee Mon Sep 17 00:00:00 2001
From: adkinsrs <sadkins@som.umaryland.edu>
Date: Tue, 23 Jun 2026 14:26:17 -0400
Subject: [PATCH 2/2] (WIP) Enhance CosMxHandler to standardize file naming and
 improve metadata handling; rename columns in observation table and ensure
 organism ID retrieval from dataset metadata.

---
 lib/gear/spatialhandler.py | 41 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/lib/gear/spatialhandler.py b/lib/gear/spatialhandler.py
index 1bb8465d..25092e98 100644
--- a/lib/gear/spatialhandler.py
+++ b/lib/gear/spatialhandler.py
@@ -721,14 +721,46 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
                             out_f.write(f.read())
                     entry.name = entry.name[:-3]    # Adjust file name
 
+                # ? We could include this to use the "points" for future additions, but not including it saves space in the output Zarr
                 if entry.name.endswith("tx_file.csv"):
                     transcripts_present = True
 
+                # For the exprMat_file.csv, fov_positions_file.csv, and metadata_file.csv files, replace the dataset_id prefix with "spatialdata" to standardize downstream usage
+                if any(entry.name.endswith(suffix) for suffix in ["exprMat_file.csv", "fov_positions_file.csv", "metadata_file.csv"]):
+                    new_name = entry.name.split("_", 1)[-1]  # Remove the dataset_id prefix
+                    new_name = "spatialdata_" + new_name    # Add the standard prefix
+                    entry.name = new_name
+
+                # For the CellComposite or CellLabels directories, strip off the dataset_id prefix to standardize downstream usage
+                if any(entry.name.startswith(prefix) for prefix in ["CellComposite", "CellLabels"]):
+                    new_name = entry.name.split("_", 1)[-1]  # Remove the dataset_id prefix
+                    entry.name = new_name
+
                 # Extract file into tmp dir
                 filepath = "{0}/{1}".format(extract_dir, entry.name)
                 tf.extract(entry, path=extract_dir)
 
+        # Try to get organism id directly or through dataset metadata
+        organism_id = kwargs.get("organism_id", None)
+        if organism_id is None and "dataset_id" in kwargs:
+            from geardb import get_dataset_by_id
+            dataset = get_dataset_by_id(kwargs.get("dataset_id"))   # assumes the metadata is already present
+            if dataset:
+                organism_id = dataset.organism_id
+        if organism_id is None:
+            raise Exception("Organism ID not found in dataset metadata or provided as an argument.")
+
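+        # In the metadata_file.csv file, rename the "cell_id" column if it exists, as it is redundant with the "cell_ID" column. NOTE(review): the extraction loop above renames this file with a "spatialdata_" prefix, so this un-prefixed path may never exist — confirm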
+        metadata_csv_path = "{}/metadata_file.csv".format(extract_dir)
+        if os.path.exists(metadata_csv_path):
+            metadata_df = pd.read_csv(metadata_csv_path)
+            if "cell_id" in metadata_df.columns:
+                metadata_df = metadata_df.rename(columns={"cell_id": "orig_cell_id"})
+                metadata_df.to_csv(metadata_csv_path, index=False)
+
         # If clustering file does not exist, raise an exception
+        # TODO: Figure out how to implement this
+        # Barcode needs to be a combination of the following values <cell_ID>_<fov>
         clustering_csv_path = "{}/clusters.csv".format(extract_dir)
         if not os.path.exists(clustering_csv_path):
             raise Exception("clusters.csv file not found in tarball.")
@@ -762,8 +794,13 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler":
         # currently gene symbols are the index, need to move them to a column
         sdata.tables[self.NORMALIZED_TABLE_NAME].var["gene_symbol"] = sdata.tables[self.NORMALIZED_TABLE_NAME].var.index
 
-        # set the index to the ensembl id (gene_ids)
-        sdata.tables[self.NORMALIZED_TABLE_NAME].var = sdata.tables[self.NORMALIZED_TABLE_NAME].var.set_index("gene_ids")
+        # Add Ensembl IDs to the adata.var
+        sdata.tables[self.NORMALIZED_TABLE_NAME] = update_adata_with_ensembl_ids(sdata.tables[self.NORMALIZED_TABLE_NAME], organism_id, "UNMAPPED_")
+
+        # Rename the "CenterX_global_px" column to "spatial1" and the "CenterY_global_px" column to "spatial2" in the observation table
+        sdata.tables[self.NORMALIZED_TABLE_NAME].obs = sdata.tables[self.NORMALIZED_TABLE_NAME].obs.rename(
+            columns={"CenterX_global_px": "spatial1", "CenterY_global_px": "spatial2"}
+            )
 
         self.sdata = sdata
         self.standardize_sdata()