@@ -487,6 +487,12 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N
487487 dataset .metadata .zarr_format = format_version
488488
489489 def sniff (self , filename : str ) -> bool :
490+ """
491+ >>> from galaxy.datatypes.sniff import get_test_fname
492+ >>> fname = get_test_fname('Images.zarr.zip')
493+ >>> CompressedZarrZipArchive().sniff(fname)
494+ True
495+ """
490496 # Check if the zip file contains a zarr store.
491497 # In theory, the zarr store must be in the root of the zip file.
492498 # See: https://github.com/zarr-developers/zarr-python/issues/756#issuecomment-852134901
@@ -4856,6 +4862,322 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N
48564862 dataset .metadata .version = struct .unpack ("<i" , header_bytes [4 :8 ])[0 ]
48574863
48584864
class SpatialData(CompressedZarrZipArchive):
    """
    Class for SpatialData file: https://spatialdata.scverse.org/

    SpatialData is an open and universal framework for processing spatial omics data.
    It implements a performant in-memory representation in Python and an on-disk
    representation based on the Zarr and Parquet data formats, following the
    OME-NGFF specification where applicable.

    The format stores multi-modal spatial omics datasets including:
    - Images (2D/3D multi-scale)
    - Labels (segmentation masks)
    - Shapes (polygons, circles)
    - Points (transcript locations, point clouds)
    - Tables (annotations)

    This datatype only inspects the Zarr (v2 style) JSON metadata files stored in
    the zip archive (``.zattrs`` / ``.zarray``); array chunks are never read.
    """

    file_ext = "spatialdata.zip"

    # Top-level groups of the store whose children are SpatialData elements.
    _ELEMENT_TYPES = ("images", "labels", "shapes", "points", "tables")

    # Minimal metadata for elements
    MetadataElement(
        name="images_count",
        desc="Number of SpatialData image elements",
        default=0,
        readonly=True,
        visible=True,
        no_value=0,
    )

    MetadataElement(
        name="labels_count",
        desc="Number of SpatialData label elements",
        default=0,
        readonly=True,
        visible=True,
        no_value=0,
    )

    MetadataElement(
        name="shapes_count",
        desc="Number of SpatialData shape elements",
        default=0,
        readonly=True,
        visible=True,
        no_value=0,
    )

    MetadataElement(
        name="points_count",
        desc="Number of SpatialData point elements",
        default=0,
        readonly=True,
        visible=True,
        no_value=0,
    )

    MetadataElement(
        name="tables",
        desc="SpatialData table elements",
        default=[],
        param=metadata.SelectParameter,
        multiple=True,
        readonly=True,
        visible=True,
    )

    MetadataElement(
        name="table_shapes",
        desc="SpatialData table shapes (n_obs, n_vars)",
        default={},
        param=metadata.DictParameter,
        readonly=True,
        visible=False,
    )

    MetadataElement(
        name="coordinate_systems",
        desc="SpatialData coordinate systems",
        default=[],
        param=metadata.SelectParameter,
        multiple=True,
        readonly=True,
        visible=True,
    )

    MetadataElement(
        name="spatialdata_version",
        desc="SpatialData software version",
        default="",
        readonly=True,
        visible=True,
        no_value="",
    )

    # ------------------------------------------------------------------
    # Helpers shared by sniff() and set_meta()
    # ------------------------------------------------------------------

    @staticmethod
    def _load_json_member(zf, member):
        """
        Return the JSON object stored in zip member ``member``.

        Returns ``None`` when the member is missing, unreadable, not valid JSON,
        or when the decoded document is not a JSON object (dict).
        """
        try:
            with zf.open(member) as handle:
                parsed = json.load(handle)
        except Exception:
            # Deliberate best-effort: a missing member (KeyError), a damaged
            # archive (BadZipFile, zlib errors) or malformed JSON must never
            # break sniffing or metadata collection for the whole dataset.
            return None
        return parsed if isinstance(parsed, dict) else None

    @staticmethod
    def _root_attrs_candidates(names) -> list:
        """
        Return the ``.zattrs`` members that may describe the store root.

        A candidate is either ``.zattrs`` at the root of the archive or any
        ``<name>.zarr/.zattrs``. Candidates are ordered shallowest first; the
        archive order is kept among candidates of equal depth.
        """
        candidates = [name for name in names if name == ".zattrs" or name.endswith(".zarr/.zattrs")]
        return sorted(candidates, key=lambda name: name.count("/"))

    def _locate_store(self, zf, names):
        """
        Locate the SpatialData store inside the archive.

        Returns a ``(prefix, root_attrs)`` tuple. ``prefix`` is the path prefix
        of every member of the store: ``""`` when the store sits at the root of
        the archive, ``"<name>.zarr/"`` when it is nested, or ``None`` when no
        root attributes file was found. ``root_attrs`` is the decoded root
        ``.zattrs`` (an empty dict when unavailable).

        The shallowest candidate carrying ``spatialdata_attrs`` wins; without
        such a candidate the shallowest one is used.
        """
        fallback = None
        for candidate in self._root_attrs_candidates(names):
            attrs = self._load_json_member(zf, candidate) or {}
            located = (candidate[: -len(".zattrs")], attrs)
            if "spatialdata_attrs" in attrs:
                return located
            if fallback is None:
                fallback = located
        if fallback is None:
            return None, {}
        return fallback

    @classmethod
    def _collect_elements(cls, names, prefix) -> dict:
        """
        Map each element type to the set of element names found below ``prefix``.

        Expected layout: ``<prefix><element_type>/<element_name>/...``.
        """
        elements = {element_type: set() for element_type in cls._ELEMENT_TYPES}
        for name in names:
            if not name.startswith(prefix):
                continue
            parts = name[len(prefix) :].split("/")
            if len(parts) < 2:
                continue
            element_type, element_name = parts[0], parts[1]
            # Skip metadata files (.zgroup, .zattrs, ...) and directory entries,
            # which produce an empty name.
            if element_type in elements and element_name and not element_name.startswith("."):
                elements[element_type].add(element_name)
        return elements

    @staticmethod
    def _transformation_outputs(transformations) -> set:
        """
        Return the output coordinate system names of a transformation list.

        ``output`` may be given as ``{"name": ...}`` or directly as a string.
        Anything that is not shaped like that is ignored. Only string names are
        kept so that the result can always be sorted.
        """
        found = set()
        if not isinstance(transformations, list):
            return found
        for transformation in transformations:
            if not isinstance(transformation, dict):
                continue
            output = transformation.get("output")
            if isinstance(output, dict):
                output = output.get("name")
            if isinstance(output, str):
                found.add(output)
        return found

    @classmethod
    def _coordinate_systems_from_attrs(cls, attrs) -> set:
        """Return every coordinate system name referenced by one ``.zattrs`` document."""
        # Transformations attached directly to the element.
        found = cls._transformation_outputs(attrs.get("coordinateTransformations"))

        # Transformations attached to multiscales entries (images/labels).
        multiscales = attrs.get("multiscales")
        if isinstance(multiscales, list):
            for multiscale in multiscales:
                if isinstance(multiscale, dict):
                    found |= cls._transformation_outputs(multiscale.get("coordinateTransformations"))

        # Legacy spatialdata attribute: a mapping keyed by coordinate system name.
        legacy = attrs.get("transform")
        if isinstance(legacy, dict):
            found.update(legacy)
        return found

    def _table_shape(self, zf, name_set, prefix, table_name):
        """
        Return ``(n_obs, n_vars)`` for one table, or ``None`` if it cannot be determined.

        The lengths are taken from the ``shape`` of the obs/var index arrays.
        AnnData records the name of the index array in the ``_index`` attribute
        of the obs/var group; ``_index`` is used when that attribute is absent.
        """
        dims = []
        for axis in ("obs", "var"):
            group = f"{prefix}tables/{table_name}/{axis}"
            index_key = "_index"
            if f"{group}/.zattrs" in name_set:
                group_attrs = self._load_json_member(zf, f"{group}/.zattrs") or {}
                declared = group_attrs.get("_index")
                if isinstance(declared, str) and declared:
                    index_key = declared
            array_path = f"{group}/{index_key}/.zarray"
            if array_path not in name_set:
                return None
            array_meta = self._load_json_member(zf, array_path) or {}
            shape = array_meta.get("shape")
            if not isinstance(shape, list) or not shape or shape[0] is None:
                return None
            dims.append(shape[0])
        return tuple(dims)

    @staticmethod
    def _format_shape(shape) -> str:
        """
        Render a table shape as ``(n_obs, n_vars)``.

        The value is a tuple right after ``set_meta`` but a list once the
        metadata went through JSON, so both are rendered identically.
        """
        if isinstance(shape, (list, tuple)):
            return "(" + ", ".join(str(dim) for dim in shape) + ")"
        return str(shape)

    # ------------------------------------------------------------------
    # Datatype API
    # ------------------------------------------------------------------

    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
        """
        Set ``dataset.peek`` to a tree summary modelled on the SpatialData repr
        and ``dataset.blurb`` to the file size (plus the software version).
        """
        if dataset.dataset.purged:
            dataset.peek = "file does not exist"
            dataset.blurb = "file purged from disk"
            return

        md = dataset.metadata
        header = "SpatialData object"
        # Show zarr format if available
        if md.zarr_format:
            header += f" (Zarr Format v{md.zarr_format})"

        # Each branch is (text, children); only non-empty element types are shown.
        branches = []
        for label, attribute in (
            ("Images", "images_count"),
            ("Labels", "labels_count"),
            ("Shapes", "shapes_count"),
            ("Points", "points_count"),
        ):
            count = getattr(md, attribute, 0)
            if count:
                branches.append((f"{label} ({count})", []))

        if md.tables:
            table_shapes = md.table_shapes or {}
            children = []
            for tbl in md.tables:
                # Add shape information if available
                if tbl in table_shapes:
                    children.append(f"'{tbl}': AnnData {self._format_shape(table_shapes[tbl])}")
                else:
                    children.append(f"'{tbl}'")
            branches.append((f"Tables ({len(md.tables)})", children))

        peek_lines = [header]
        for position, (text, children) in enumerate(branches):
            # Only the final entry of a level closes the tree with "└──".
            is_last = position == len(branches) - 1
            peek_lines.append(f"{'└──' if is_last else '├──'} {text}")
            child_indent = "      " if is_last else "│     "
            for child_position, child in enumerate(children):
                connector = "└──" if child_position == len(children) - 1 else "├──"
                peek_lines.append(f"{child_indent}{connector} {child}")

        # Show coordinate systems if available
        if md.coordinate_systems:
            peek_lines.append("")
            peek_lines.append("with coordinate systems:")
            peek_lines.extend(f"    • {cs}" for cs in md.coordinate_systems)

        dataset.peek = "\n".join(peek_lines)
        dataset.blurb = f"SpatialData file ({nice_size(dataset.get_size())})"
        if md.spatialdata_version:
            dataset.blurb += f"\nVersion: {md.spatialdata_version}"

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
        """
        Collect SpatialData metadata from the JSON files of the zipped store.

        Sets the element counts, the table names and shapes, the coordinate
        system names and the SpatialData software version. The store may sit at
        the root of the archive or inside a single ``<name>.zarr/`` directory,
        matching what ``sniff`` accepts. Collection is best-effort: on an
        unreadable archive the SpatialData metadata keeps its previous values.
        """
        super().set_meta(dataset, overwrite=overwrite, **kwd)
        try:
            with zipfile.ZipFile(dataset.get_file_name()) as zf:
                names = zf.namelist()
                # Built once: membership tests on the list are linear per lookup.
                name_set = set(names)

                prefix, root_attrs = self._locate_store(zf, names)

                # Read root attributes for version info
                spatialdata_attrs = root_attrs.get("spatialdata_attrs")
                spatialdata_version = ""
                if isinstance(spatialdata_attrs, dict):
                    spatialdata_version = spatialdata_attrs.get("spatialdata_software_version") or ""

                # Elements are only meaningful relative to a located store root.
                if prefix is None:
                    elements = {element_type: set() for element_type in self._ELEMENT_TYPES}
                else:
                    elements = self._collect_elements(names, prefix)

                # Coordinate systems are gathered from every .zattrs in the archive.
                coordinate_systems = set()
                for name in names:
                    if name.endswith(".zattrs"):
                        attrs = self._load_json_member(zf, name)
                        if attrs:
                            coordinate_systems |= self._coordinate_systems_from_attrs(attrs)

                table_shapes = {}
                for table_name in elements["tables"]:
                    shape = self._table_shape(zf, name_set, prefix, table_name)
                    if shape is not None:
                        table_shapes[table_name] = shape

                # Set metadata: counts for most elements, names for tables and
                # coordinate systems, plus the table shapes.
                dataset.metadata.images_count = len(elements["images"])
                dataset.metadata.labels_count = len(elements["labels"])
                dataset.metadata.shapes_count = len(elements["shapes"])
                dataset.metadata.points_count = len(elements["points"])
                dataset.metadata.tables = sorted(elements["tables"])
                dataset.metadata.table_shapes = table_shapes
                dataset.metadata.coordinate_systems = sorted(coordinate_systems)
                dataset.metadata.spatialdata_version = spatialdata_version
        except Exception:
            # Deliberate best-effort: metadata extraction must not fail the
            # dataset; the parent metadata set above is kept as is.
            pass

    def sniff(self, filename: str) -> bool:
        """
        Check if the file is a valid SpatialData zarr archive.

        SpatialData files are Zarr archives whose root .zattrs file carries
        spatialdata_attrs metadata. The store may be at the root of the zip
        file or one level deeper, in a <name>.zarr directory.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('subsampled_visium.spatialdata.zip')
        >>> SpatialData().sniff(fname)
        True
        >>> fname = get_test_fname('Images.zarr.zip')
        >>> SpatialData().sniff(fname)
        False
        """
        try:
            # First, check if this is a zarr archive at all
            if not super().sniff(filename):
                return False

            with zipfile.ZipFile(filename) as zf:
                # The root .zattrs with spatialdata_attrs is what distinguishes
                # SpatialData from other zarr archives.
                for candidate in self._root_attrs_candidates(zf.namelist()):
                    # Candidates are ordered shallowest first; anything deeper
                    # than <name>.zarr/.zattrs is not a store root.
                    if candidate.count("/") > 1:
                        break
                    attrs = self._load_json_member(zf, candidate)
                    if attrs is not None and "spatialdata_attrs" in attrs:
                        return True
            return False
        except Exception:
            # Any exception during parsing means it's not a valid spatialdata file
            return False
5179+
5180+
48595181@build_sniff_from_prefix
48605182class Safetensors (Binary ):
48615183 """
0 commit comments