@@ -54,36 +54,14 @@ def cif_to_structure(cif: str) -> Structure | None:
5454
def cif_to_atoms(cif: str) -> Atoms | None:
    """
    Convert a CIF string to an ASE Atoms object.

    The CIF text is first parsed with ``cif_to_structure`` and the resulting
    structure is handed to ``AseAtomsAdaptor.get_atoms``.

    Args:
        cif: CIF format string containing crystal structure data.

    Returns:
        The converted Atoms object, or None when ``cif`` is empty or
        ``cif_to_structure`` yields no structure.
    """
    if not cif:
        return None

    structure = cif_to_structure(cif)
    # cif_to_structure is annotated as returning ``Structure | None``; a
    # missing structure must not be forwarded to the adaptor.
    if structure is None:
        return None
    return AseAtomsAdaptor.get_atoms(structure)
6960
7061
7162def get_partition_id (key : str , npartitions : int = 1000 ) -> int :
7263 """
73- Generate a consistent partition ID for distributed processing of structures.
74-
75- This function creates deterministic partitioning for parallel processing,
76- ensuring that structures with the same key always map to the same partition.
77-
78- Args:
79- key: String identifier for the structure (e.g., molecule_name + space_group)
80- npartitions: Total number of partitions for distribution (default: 1000)
81-
82- Returns:
83- int: Partition ID in range [0, npartitions-1]
84-
85- Notes:
86- - Deterministic: same key always produces same partition ID
64+ Generate consistent partition ID from key using MD5 hash.
8765 """
8866 key_encoded = key .encode ("utf-8" )
8967 md5_hash = hashlib .md5 ()
@@ -102,12 +80,7 @@ def get_structure_hash(
10280 vol_bin_size : float = 0.2 ,
10381) -> str :
10482 """
105- Generate a hash string for crystal structure grouping and fast pre-filtering.
106-
107- Creates a binned hash based on chemical formula and geometric properties to
108- enable fast pre-filtering before expensive crystallographic comparisons.
109- This approach dramatically reduces the number of structure pairs that need
110- detailed comparison during deduplication.
83+ Generate hash string for structure grouping based on formula and binned properties.
11184
11285 Args:
11386 structure: Pymatgen Structure object to hash
@@ -118,35 +91,25 @@ def get_structure_hash(
11891 vol_bin_size: Bin size for volume discretization (cubed to give the bin width in Å³)
11992
12093 Returns:
121- Hash string combining formula, Z, and optionally density/volume bins
122-
123- Hashing Strategy:
124- 1. Start with reduced chemical formula and Z value
125- 2. Add binned density if use_density=True for packing similarity
126- 3. Add binned volume if use_volume=True for volume grouping
127- 4. Combine components for readable hash
128-
129- Example:
130- >>> get_structure_hash(structure, z=4, use_density=True)
131- "C6H4O4_4_1.5_125.2" # Formula_Z_density_volume
94+ Hash string like "C6H4O4_z4_d1.5_v125.2" (the d/v parts appear only when use_density / use_volume are set).
13295 """
133- # Start with chemical composition and stoichiometry
96+ # Start with chemical composition
13497 formula = structure .composition .reduced_formula
135- hash_components = [formula , str (z )]
98+ hash_components = [formula , "z" + str (z )]
13699
137100 # Add density-based grouping if requested
138101 if use_density :
139102 density = structure .density
140103 # Bin density to group structures with similar packing
141104 density_bin = round (density / density_bin_size ) * density_bin_size
142- hash_components .append (f"{ density_bin :.1f} " )
105+ hash_components .append (f"d { density_bin :.1f} " )
143106
144107 # Add volume-based grouping if requested
145108 if use_volume :
146109 volume = structure .volume
147110 # Bin volume to group structures with similar cell sizes
148111 vol_bin = round (volume / vol_bin_size ** 3 ) * vol_bin_size ** 3
149- hash_components .append (f"{ vol_bin :.1f} " )
112+ hash_components .append (f"v { vol_bin :.1f} " )
150113
151114 # Combine all components into single hash string
152115 return "_" .join (hash_components )
@@ -156,32 +119,14 @@ def check_no_changes_in_covalent_matrix(
156119 initial_atoms : Atoms , final_atoms : Atoms
157120) -> bool :
158121 """
159- Validate that covalent bonding network is preserved during structure relaxation.
160-
161- Compares the covalent bonding adjacency matrices before and after ML-based
162- relaxation to detect unwanted chemical reconstructions. This validation ensures
163- that the relaxation process only optimizes geometry without breaking or forming
164- chemical bonds, which would indicate problematic initial structures or
165- relaxation failures.
122+ Check if covalent bonding network is preserved after relaxation.
166123
167124 Args:
168- initial_atoms: Original structure before relaxation
169- final_atoms: Structure after ML-based relaxation
125+ initial_atoms: Structure before relaxation.
126+ final_atoms: Structure after relaxation.
170127
171128 Returns:
172- True if bonding network is preserved, False otherwise
173- Returns False if either structure is None (error handling)
174-
175- Algorithm:
176- 1. Convert ASE Atoms to pymatgen Structures for analysis
177- 2. Use JmolNN to identify covalent neighbors in both structures
178- 3. Build adjacency matrices representing bonding networks
179- 4. Compare matrices for exact equality
180-
181- Validation Purpose:
182- - Detect atom overlaps that lead to artificial bonding
183- - Identify relaxation artifacts that break molecular integrity
184- - Filter out reconstructions that change chemical connectivity
129+ True if bonding network unchanged, False otherwise.
185130 """
186131 # Handle error cases where structures couldn't be processed
187132 if initial_atoms is None or final_atoms is None :
0 commit comments