From 8582979f242da1ee89baef0b88f07ecab52574a3 Mon Sep 17 00:00:00 2001 From: adkinsrs Date: Wed, 10 Jun 2026 15:54:34 -0400 Subject: [PATCH 1/2] (WIP) Enhance CosMxHandler to process CosMx spatial data tarballs, including file extraction and gene ID updates; update properties for region ID and name. --- lib/gear/spatialhandler.py | 113 +++++++++++++++++++++++++++++++++---- 1 file changed, 103 insertions(+), 10 deletions(-) diff --git a/lib/gear/spatialhandler.py b/lib/gear/spatialhandler.py index 5b903862..1bb8465d 100644 --- a/lib/gear/spatialhandler.py +++ b/lib/gear/spatialhandler.py @@ -651,6 +651,7 @@ class CosMxHandler(SpatialHandler): * `_`'exprMat_file.csv'`: Counts matrix. * `_`'metadata_file.csv'`: Metadata file. * `_`'fov_positions_file.csv'`: Field of view file. + * (Optional) `_`'tx_file.csv'`: Transcripts file * 'CellComposite': Directory containing the images. * 'CellLabels': Directory containing the labels. """ @@ -663,17 +664,17 @@ def has_images(self) -> bool: @property def coordinate_system(self) -> str: """Returns the coordinate system used by CosMx datasets.""" - return "global" + return "global" # may also be "spatial" @property def region_id(self) -> str: """Returns the region ID used for spot data.""" - return "instance_id" + return "cell_ID" @property def region_name(self) -> str: """Returns the name of the region used for spot data.""" - return "locations" + return "fov_labels" @property def platform(self) -> str: @@ -687,11 +688,87 @@ def img_name(self) -> str | None: def process_file(self, filepath: str, **kwargs) -> "SpatialHandler": """ - Reads and processes a CosMx spatial data file from the given filepath. - For CosMx, this is a stub and does not perform any operation. + Reads and processes a CosMx spatial data tarball from the given filepath. + Extracts required files, loads clustering and spatial data, updates gene IDs, and loads into a SpatialData object. 
""" - return self + extract_dir = kwargs.get("extract_dir", '/tmp/') + extract_dir = os.path.join(extract_dir, 'files') + + if filepath.endswith(".tar.gz"): + mode = "r:gz" # Read as gzipped tar file + elif filepath.endswith(".tar"): + mode = "r" # Read as plain tar file + else: + raise Exception("File must be a .tar or .tar.gz file.") + + _remove_dir(extract_dir) + + transcripts_present = False + + with tarfile.open(filepath, mode) as tf: + for entry in tf: + # Skip any BSD tar artifacts, like files that start with ._ or .DS_Store + if ".DS_Store" in entry.name or "._" in entry.name: + continue + + # IF file is gzipped, gunzip it + if entry.name.endswith(".gz"): + entry_io = tf.extractfile(entry) + if entry_io is None: + raise Exception("Error occurred while extracting file: ", entry.name) + with entry_io as f: + with open(os.path.join(extract_dir, entry.name[:-3]), "wb") as out_f: + out_f.write(f.read()) + entry.name = entry.name[:-3] # Adjust file name + + if entry.name.endswith("tx_file.csv"): + transcripts_present = True + + # Extract file into tmp dir + filepath = "{0}/{1}".format(extract_dir, entry.name) + tf.extract(entry, path=extract_dir) + + # If clustering file does not exist, raise an exception + clustering_csv_path = "{}/clusters.csv".format(extract_dir) + if not os.path.exists(clustering_csv_path): + raise Exception("clusters.csv file not found in tarball.") + + # If clustering file does not have "Barcode" and "Cluster" columns, raise an exception + #with open(clustering_csv_path, 'r') as f: + # first_line = f.readline() + # if "Barcode" not in first_line or "Cluster" not in first_line: + # raise Exception("clusters.csv file does not have 'Barcode' and 'Cluster' columns in clusters.csv file in tarball.") + + try: + sdata = sdio.cosmx(extract_dir + , dataset_id="spatialdata" # Provide a name to standarize downstream usage + , transcripts=transcripts_present + ) + except Exception: + raise + + # add clustering information to the vis_sdata.table.obs 
dataframe + #clustering = pd.read_csv(clustering_csv_path) + # make barcode as index + #clustering = clustering.set_index('Barcode') + #sdata.tables[self.NORMALIZED_TABLE_NAME].obs['clusters'] = clustering['Cluster'].astype('category') + # If all clusters are missing, raise an exception + #if sdata.tables[self.NORMALIZED_TABLE_NAME].obs['clusters'].isna().all(): + # raise Exception("All cluster values are missing in clusters.csv file in tarball.") + # The Space Ranger h5 matrix has the gene names as the index, need to move them to a column and set the index to the ensembl id + sdata.tables[self.NORMALIZED_TABLE_NAME].var_names_make_unique() + + # currently gene symbols are the index, need to move them to a column + sdata.tables[self.NORMALIZED_TABLE_NAME].var["gene_symbol"] = sdata.tables[self.NORMALIZED_TABLE_NAME].var.index + + # set the index to the ensembl id (gene_ids) + sdata.tables[self.NORMALIZED_TABLE_NAME].var = sdata.tables[self.NORMALIZED_TABLE_NAME].var.set_index("gene_ids") + + self.sdata = sdata + self.standardize_sdata() + self.originalFile = filepath + return self class CurioHandler(SpatialHandler): """ @@ -810,7 +887,10 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler": var_features_moransi.to_csv(spatial_moransi_file, sep="\t", header=True, index=True, index_label=False) # Now are ready to read in to a SpatialData object - sdata = sdio.curio(extract_dir) + try: + sdata = sdio.curio(extract_dir) + except Exception: + raise # To get the adata equivalent, look at sdata.tables["table"] @@ -1073,7 +1153,10 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler": if "Barcode" not in first_line or "Cluster" not in first_line: raise Exception("clusters.csv file does not have 'Barcode' and 'Cluster' columns in clusters.csv file in tarball.") - sdata = sdio.visium(path=extract_dir, dataset_id="spatialdata") # Provide a name to standarize downstream usage + try: + sdata = sdio.visium(path=extract_dir, 
dataset_id="spatialdata") # Provide a name to standarize downstream usage + except Exception: + raise # add clustering information to the vis_sdata.table.obs dataframe clustering = pd.read_csv(clustering_csv_path) @@ -1151,6 +1234,10 @@ def img_name(self) -> str | None: return "spatialdata_hires_image" def process_file(self, filepath: str, **kwargs) -> "SpatialHandler": + """ + Reads and processes a Visium HD spatial data tarball from the given filepath. + Extracts required files, loads clustering and spatial data, updates gene IDs, and loads into a SpatialData object. + """ extract_dir = kwargs.get("extract_dir", '/tmp/') extract_dir = os.path.join(extract_dir, 'files') @@ -1203,7 +1290,8 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler": if not os.path.exists("{}/spatialdata_feature_slice.h5".format(binned_outputs_dir)): os.symlink("{}/feature_slice.h5".format(absolute_path), "{}/spatialdata_feature_slice.h5".format(binned_outputs_dir)) - sdata = sdio.visium_hd(binned_outputs_dir + try: + sdata = sdio.visium_hd(binned_outputs_dir , dataset_id="spatialdata" # Provide a name to standarize downstream usage , bin_size=8 , filtered_counts_file=True @@ -1211,6 +1299,8 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler": , fullres_image_file=None , bins_as_squares=True ) + except Exception: + raise # add clustering information to the vis_sdata.table.obs dataframe clustering = pd.read_csv(clustering_csv_path) @@ -1340,7 +1430,8 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler": if "Barcode" not in first_line or "Cluster" not in first_line: raise Exception("clusters.csv file does not have 'Barcode' and 'Cluster' columns in clusters.csv file in tarball.") - sdata = sdio.xenium(extract_dir + try: + sdata = sdio.xenium(extract_dir , cells_labels=False # Avoid adding polygons to SpatialData object (for now due to out-of-memory issues) , nucleus_labels=False , cell_boundaries=cell_boundaries_present @@ -1349,6 +1440,8 
@@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler": , cells_as_circles=True # Table is associated with the cells instead of the nuclei (faster performance) , morphology_mip=False # Using the morphology_focus image instead ) + except Exception: + raise # In code, it seems that the Xenium reader is supposed to set the index to the "barcodes" column # But this column is not found, so we need to manually replace with "cell_id" From da9e0555e8cc123d131f0205a7fa019a567764ee Mon Sep 17 00:00:00 2001 From: adkinsrs Date: Tue, 23 Jun 2026 14:26:17 -0400 Subject: [PATCH 2/2] (WIP) Enhance CosMxHandler to standardize file naming and improve metadata handling; rename columns in observation table and ensure organism ID retrieval from dataset metadata. --- lib/gear/spatialhandler.py | 41 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/lib/gear/spatialhandler.py b/lib/gear/spatialhandler.py index 1bb8465d..25092e98 100644 --- a/lib/gear/spatialhandler.py +++ b/lib/gear/spatialhandler.py @@ -721,14 +721,46 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler": out_f.write(f.read()) entry.name = entry.name[:-3] # Adjust file name + # ? 
We could include this to use the "points" for future additions, but not including it saves space in the output Zarr if entry.name.endswith("tx_file.csv"): transcripts_present = True + # For the exprMat_file.csv, fov_positions_file.csv, and metadata_file.csv files, replace the dataset_id prefix with "spatialdata" to standardize downstream usage + if any(entry.name.endswith(suffix) for suffix in ["exprMat_file.csv", "fov_positions_file.csv", "metadata_file.csv"]): + new_name = entry.name.split("_", 1)[-1] # Remove the dataset_id prefix + new_name = "spatialdata_" + new_name # Add the standard prefix + entry.name = new_name + + # For the CellComposite or CellLabels directories, strip off the dataset_id prefix to standardize downstream usage + if any(entry.name.startswith(prefix) for prefix in ["CellComposite", "CellLabels"]): + new_name = entry.name.split("_", 1)[-1] # Remove the dataset_id prefix + entry.name = new_name + # Extract file into tmp dir filepath = "{0}/{1}".format(extract_dir, entry.name) tf.extract(entry, path=extract_dir) + # Try to get organism id directly or through dataset metadata + organism_id = kwargs.get("organism_id", None) + if organism_id is None and "dataset_id" in kwargs: + from geardb import get_dataset_by_id + dataset = get_dataset_by_id(kwargs.get("dataset_id")) # assumes the metadata is already present + if dataset: + organism_id = dataset.organism_id + if organism_id is None: + raise Exception("Organism ID not found in dataset metadata or provided as an argument.") + + # In the metadata_file.csv file, rename the "cell_id" column if it exists, as it is redundant with the "cell_ID" column + metadata_csv_path = "{}/metadata_file.csv".format(extract_dir) + if os.path.exists(metadata_csv_path): + metadata_df = pd.read_csv(metadata_csv_path) + if "cell_id" in metadata_df.columns: + metadata_df = metadata_df.rename(columns={"cell_id": "orig_cell_id"}) + metadata_df.to_csv(metadata_csv_path, index=False) + # If clustering file does not exist, 
raise an exception + # TODO: Figure out how to implement this + # Barcode needs to be a combination of the following values _ clustering_csv_path = "{}/clusters.csv".format(extract_dir) if not os.path.exists(clustering_csv_path): raise Exception("clusters.csv file not found in tarball.") @@ -762,8 +794,13 @@ def process_file(self, filepath: str, **kwargs) -> "SpatialHandler": # currently gene symbols are the index, need to move them to a column sdata.tables[self.NORMALIZED_TABLE_NAME].var["gene_symbol"] = sdata.tables[self.NORMALIZED_TABLE_NAME].var.index - # set the index to the ensembl id (gene_ids) - sdata.tables[self.NORMALIZED_TABLE_NAME].var = sdata.tables[self.NORMALIZED_TABLE_NAME].var.set_index("gene_ids") + # Add Ensembl IDs to the adata.var + sdata.tables[self.NORMALIZED_TABLE_NAME] = update_adata_with_ensembl_ids(sdata.tables[self.NORMALIZED_TABLE_NAME], organism_id, "UNMAPPED_") + + # Rename the "CenterX_global_px" column to "spatial1" and the "CenterY_global_px" column to "spatial2" in the observation table + sdata.tables[self.NORMALIZED_TABLE_NAME].obs = sdata.tables[self.NORMALIZED_TABLE_NAME].obs.rename( + columns={"CenterX_global_px": "spatial1", "CenterY_global_px": "spatial2"} + ) self.sdata = sdata self.standardize_sdata()