@@ -487,6 +487,12 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N
487487 dataset .metadata .zarr_format = format_version
488488
489489 def sniff (self , filename : str ) -> bool :
490+ """
491+ >>> from galaxy.datatypes.sniff import get_test_fname
492+ >>> fname = get_test_fname('Images.zarr.zip')
493+ >>> CompressedZarrZipArchive().sniff(fname)
494+ True
495+ """
490496 # Check if the zip file contains a zarr store.
491497 # In theory, the zarr store must be in the root of the zip file.
492498 # See: https://github.com/zarr-developers/zarr-python/issues/756#issuecomment-852134901
@@ -4874,6 +4880,322 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N
48744880 dataset .metadata .version = struct .unpack ("<i" , header_bytes [4 :8 ])[0 ]
48754881
48764882
class SpatialData(CompressedZarrZipArchive):
    """
    Class for SpatialData file: https://spatialdata.scverse.org/

    SpatialData: an open and universal framework for processing spatial omics data.
    SpatialData aims at implementing a performant in-memory representation in Python
    and an on-disk representation based on the Zarr and Parquet data formats
    and following, when applicable, the OME-NGFF specification

    The format stores multi-modal spatial omics datasets including:
    - Images (2D/3D multi-scale)
    - Labels (segmentation masks)
    - Shapes (polygons, circles)
    - Points (transcript locations, point clouds)
    - Tables (annotations)
    """

    file_ext = "spatialdata.zip"

    # Top-level groups of a SpatialData store that hold named elements.
    _ELEMENT_TYPES = ("images", "labels", "shapes", "points", "tables")

    # (peek label, metadata attribute) for the element types reported as counts.
    _COUNTED_ELEMENTS = (
        ("Images", "images_count"),
        ("Labels", "labels_count"),
        ("Shapes", "shapes_count"),
        ("Points", "points_count"),
    )

    # Minimal metadata for elements
    MetadataElement(
        name="images_count",
        desc="Number of SpatialData image elements",
        default=0,
        readonly=True,
        visible=True,
        no_value=0,
    )

    MetadataElement(
        name="labels_count",
        desc="Number of SpatialData label elements",
        default=0,
        readonly=True,
        visible=True,
        no_value=0,
    )

    MetadataElement(
        name="shapes_count",
        desc="Number of SpatialData shape elements",
        default=0,
        readonly=True,
        visible=True,
        no_value=0,
    )

    MetadataElement(
        name="points_count",
        desc="Number of SpatialData point elements",
        default=0,
        readonly=True,
        visible=True,
        no_value=0,
    )

    MetadataElement(
        name="tables",
        desc="SpatialData table elements",
        default=[],
        param=metadata.SelectParameter,
        multiple=True,
        readonly=True,
        visible=True,
    )

    MetadataElement(
        name="table_shapes",
        desc="SpatialData table shapes (n_obs, n_vars)",
        default={},
        param=metadata.DictParameter,
        readonly=True,
        visible=False,
    )

    MetadataElement(
        name="coordinate_systems",
        desc="SpatialData coordinate systems",
        default=[],
        param=metadata.SelectParameter,
        multiple=True,
        readonly=True,
        visible=True,
    )

    MetadataElement(
        name="spatialdata_version",
        desc="SpatialData software version",
        default="",
        readonly=True,
        visible=True,
        no_value="",
    )

    @staticmethod
    def _read_json(zf: zipfile.ZipFile, member: str) -> dict:
        """
        Return the JSON object stored in ``member`` of ``zf``.

        An empty dict is returned when the member is missing, unreadable, not
        valid JSON, or does not contain a JSON object, so callers can use
        ``in`` / ``.get`` on the result without further checks.
        """
        try:
            with zf.open(member) as handle:
                loaded = json.load(handle)
        except Exception:
            # Deliberately best effort: one corrupt attribute file must not
            # prevent sniffing or the rest of the metadata from being set.
            return {}
        return loaded if isinstance(loaded, dict) else {}

    @staticmethod
    def _find_root_prefix(names):
        """
        Return the path prefix of the SpatialData root group inside the zip.

        ``""`` means the store sits at the zip root (a top-level ``.zattrs``),
        ``"<name>.zarr/"`` means it is wrapped in a ``.zarr`` directory, and
        ``None`` means no root attribute file was found.
        """
        if ".zattrs" in names:
            return ""
        for name in names:
            if name.endswith(".zarr/.zattrs"):
                return name[: -len(".zattrs")]
        return None

    @staticmethod
    def _output_coordinate_systems(transforms) -> set:
        """
        Collect output coordinate system names from a ``coordinateTransformations`` value.

        Each transformation's ``output`` may be a dict carrying a ``name`` or a
        plain string; anything else (including a non-list ``transforms``) is ignored.
        """
        found = set()
        if not isinstance(transforms, list):
            return found
        for transform in transforms:
            if not isinstance(transform, dict):
                continue
            output = transform.get("output")
            if isinstance(output, dict):
                output = output.get("name")
            # Only strings are kept so the names stay sortable and hashable.
            if isinstance(output, str):
                found.add(output)
        return found

    def _axis_length(self, zf: zipfile.ZipFile, members, axis_path: str):
        """
        Return the length of an AnnData ``obs``/``var`` axis, or ``None`` if unknown.

        The length is the first dimension of the dataframe's index array. The
        index column name is taken from the ``_index`` attribute of the
        dataframe group and falls back to the literal ``_index`` column.
        """
        index_column = "_index"
        attrs_path = f"{axis_path}/.zattrs"
        if attrs_path in members:
            declared = self._read_json(zf, attrs_path).get("_index")
            if isinstance(declared, str) and declared:
                index_column = declared
        array_path = f"{axis_path}/{index_column}/.zarray"
        if array_path not in members:
            return None
        shape = self._read_json(zf, array_path).get("shape")
        if isinstance(shape, list) and shape:
            return shape[0]
        return None

    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
        """
        Set ``dataset.peek`` to a tree-style summary of the stored metadata and
        ``dataset.blurb`` to the file size (plus the SpatialData version if known).
        """
        if dataset.dataset.purged:
            dataset.peek = "file does not exist"
            dataset.blurb = "file purged from disk"
            return

        meta = dataset.metadata
        # Try to make a metadata like spatialdata file itself
        peek_lines = ["SpatialData object"]

        # Show zarr format if available
        if meta.zarr_format:
            peek_lines[0] += f" (Zarr Format v{meta.zarr_format})"

        # Gather (title, children) branches first so the last one can be drawn
        # with the closing connector regardless of which element types exist.
        branches = []
        for label, attr in self._COUNTED_ELEMENTS:
            count = getattr(meta, attr, 0)
            if count:
                branches.append((f"{label} ({count})", []))

        if meta.tables:
            known_shapes = meta.table_shapes or {}
            children = []
            for tbl in meta.tables:
                if tbl in known_shapes:
                    shape = known_shapes[tbl]
                    # Shapes come back as lists after a JSON round trip; show
                    # them as (n_obs, n_vars) either way.
                    if isinstance(shape, (list, tuple)):
                        shape = tuple(shape)
                    children.append(f"'{tbl}': AnnData {shape}")
                else:
                    children.append(f"'{tbl}'")
            branches.append((f"Tables ({len(meta.tables)})", children))

        for position, (title, children) in enumerate(branches):
            is_last_branch = position == len(branches) - 1
            peek_lines.append(f"{'└──' if is_last_branch else '├──'} {title}")
            pad = "    " if is_last_branch else "│   "
            for child_position, child in enumerate(children):
                connector = "└──" if child_position == len(children) - 1 else "├──"
                peek_lines.append(f"{pad}{connector} {child}")

        # Show coordinate systems if available
        if meta.coordinate_systems:
            peek_lines.append("")
            peek_lines.append("with coordinate systems:")
            peek_lines.extend(f"    • {cs}" for cs in meta.coordinate_systems)

        dataset.peek = "\n".join(peek_lines)
        dataset.blurb = f"SpatialData file ({nice_size(dataset.get_size())})"
        if meta.spatialdata_version:
            dataset.blurb += f"\nVersion: {meta.spatialdata_version}"

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
        """
        Populate element counts, table names and shapes, coordinate systems and
        the SpatialData software version by inspecting the zip listing and the
        JSON attribute files, without extracting any array data.

        Extraction is best effort: if the archive cannot be read, the failure
        is logged and whatever the parent class set is left in place.
        """
        super().set_meta(dataset, overwrite=overwrite, **kwd)
        try:
            with zipfile.ZipFile(dataset.get_file_name()) as zf:
                names = zf.namelist()
                members = set(names)
                root_prefix = self._find_root_prefix(members if ".zattrs" in members else names)

                # Read root attributes for version info
                spatialdata_version = ""
                if root_prefix is not None:
                    sd_attrs = self._read_json(zf, f"{root_prefix}.zattrs").get("spatialdata_attrs")
                    if isinstance(sd_attrs, dict):
                        spatialdata_version = sd_attrs.get("spatialdata_software_version", "")

                elements = {element_type: set() for element_type in self._ELEMENT_TYPES}
                coordinate_systems = set()

                for name in names:
                    # Expected structure: <root>/<element_type>/<element_name>/...
                    if root_prefix is not None and name.startswith(root_prefix):
                        rel_parts = name[len(root_prefix) :].split("/")
                        if len(rel_parts) >= 2:
                            element_type, element_name = rel_parts[0], rel_parts[1]
                            # Skip metadata files and empty names (directory entries)
                            if element_type in elements and element_name and not element_name.startswith("."):
                                elements[element_type].add(element_name)

                    # Extract coordinate system information from .zattrs files
                    if name.endswith(".zattrs"):
                        attrs = self._read_json(zf, name)
                        coordinate_systems |= self._output_coordinate_systems(attrs.get("coordinateTransformations"))

                        # Multiscales (images/labels) nest their transformations
                        multiscales = attrs.get("multiscales")
                        if isinstance(multiscales, list):
                            for ms in multiscales:
                                if isinstance(ms, dict):
                                    coordinate_systems |= self._output_coordinate_systems(
                                        ms.get("coordinateTransformations")
                                    )

                        # spatialdata transform attribute (legacy): keyed by coordinate system
                        legacy_transform = attrs.get("transform")
                        if isinstance(legacy_transform, dict):
                            coordinate_systems.update(legacy_transform.keys())

                # Set metadata: counts for most elements, but keep tables and
                # coordinate system names and table shapes for compatibility.
                dataset.metadata.images_count = len(elements["images"])
                dataset.metadata.labels_count = len(elements["labels"])
                dataset.metadata.shapes_count = len(elements["shapes"])
                dataset.metadata.points_count = len(elements["points"])

                tables = sorted(elements["tables"])
                dataset.metadata.tables = tables
                table_shapes = {}
                for table_name in tables:
                    table_path = f"{root_prefix}tables/{table_name}"
                    n_obs = self._axis_length(zf, members, f"{table_path}/obs")
                    n_vars = self._axis_length(zf, members, f"{table_path}/var")
                    # A shape is only recorded when both axes are known.
                    if n_obs is not None and n_vars is not None:
                        table_shapes[table_name] = (n_obs, n_vars)
                dataset.metadata.table_shapes = table_shapes
                dataset.metadata.coordinate_systems = sorted(coordinate_systems)
                dataset.metadata.spatialdata_version = spatialdata_version
        except Exception:
            # Metadata is optional; never fail the dataset because of it, but
            # leave a trace so unreadable archives can be diagnosed.
            import logging

            logging.getLogger(__name__).warning("Could not extract SpatialData metadata", exc_info=True)

    def sniff(self, filename: str) -> bool:
        """
        Check if the file is a valid SpatialData zarr archive.

        SpatialData files are Zarr archives with specific structure containing
        a root .zattrs file with spatialdata_attrs metadata and element directories
        like images/, labels/, shapes/, points/, or tables/.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('subsampled_visium.spatialdata.zip')
        >>> SpatialData().sniff(fname)
        True
        >>> fname = get_test_fname('Images.zarr.zip')
        >>> SpatialData().sniff(fname)
        False
        """
        try:
            # First, check if this is a zarr archive at all; this is done before
            # opening the zip here so the archive is not held open twice.
            if not super().sniff(filename):
                return False

            with zipfile.ZipFile(filename) as zf:
                # Look for the root .zattrs file with spatialdata_attrs.
                # This can distinguish spatialdata from other zarr archives.
                for name in zf.namelist():
                    parts = name.split("/")
                    # Root level: .zattrs or one level deep: <name>.zarr/.zattrs
                    at_store_root = name == ".zattrs" or (
                        len(parts) == 2 and parts[0].endswith(".zarr") and parts[1] == ".zattrs"
                    )
                    if at_store_root and "spatialdata_attrs" in self._read_json(zf, name):
                        return True
            return False
        except Exception:
            # Any exception during parsing means it's not a valid spatialdata file
            return False
5197+
5198+
48775199@build_sniff_from_prefix
48785200class Safetensors (Binary ):
48795201 """
0 commit comments