Skip to content

Commit ed2ab16

Browse files
committed
update backend
1 parent d5fbdf4 commit ed2ab16

10 files changed

Lines changed: 172 additions & 200 deletions

File tree

src/fairchem/applications/fastcsp/core/utils/configuration.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -165,9 +165,9 @@ def _validate_tolerance_params(params: dict[str, Any], param_set_name: str) -> N
165165
"""Validate tolerance parameters are positive numbers."""
166166
tolerance_params = ["ltol", "stol", "angle_tol"]
167167
for param in tolerance_params:
168-
if (param in params and not isinstance(params[param], (int, float))) or params[
169-
param
170-
] <= 0:
168+
if param in params and (
169+
not isinstance(params[param], (int, float)) or params[param] <= 0
170+
):
171171
raise ValueError(
172172
f"'{param}' in '{param_set_name}' must be a positive number"
173173
)

src/fairchem/applications/fastcsp/core/utils/deduplicate.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
def process_structure_group(group_data, ltol=0.2, stol=0.3, angle_tol=5):
2525
"""
2626
Apply crystallographic deduplication within a pre-filtered structure group.
27+
28+
Args:
29+
group_data: Tuple of (indices, structures) for this group
30+
ltol, stol, angle_tol: Tolerances for pymatgen StructureMatcher (defaults: 0.2, 0.3, 5)
2731
"""
2832
indices, structures = group_data
2933

@@ -85,10 +89,10 @@ def deduplicate_structures(
8589
logger = get_central_logger()
8690

8791
# Stage 1: Generate hash-based groups for pre-filtering
88-
logger.info("Generating structure hashes for pre-filtering...")
89-
logger.info(f"Hashing settings - Density: {hash_density}, Volume: {hash_volume}")
90-
logger.info(f"Total structures to process: {len(structures_df)}")
91-
logger.info(f"Structure DataFrame head:\n{structures_df.head()}")
92+
logger.debug("Generating structure hashes for pre-filtering...")
93+
logger.debug(f"Hashing settings - Density: {hash_density}, Volume: {hash_volume}")
94+
logger.debug(f"Total structures to process: {len(structures_df)}")
95+
logger.debug(f"Structure DataFrame head:\n{structures_df.head()}")
9296
hashes = structures_df[["structure", "z"]].apply(
9397
lambda x: get_structure_hash(
9498
x["structure"],

src/fairchem/applications/fastcsp/core/utils/logging.py

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414
import sys
1515
from datetime import datetime
1616
from pathlib import Path
17-
from typing import Any
17+
from typing import TYPE_CHECKING, Any
18+
19+
if TYPE_CHECKING:
20+
from fairchem.applications.fastcsp.core.utils.configs import LoggingConfig
1821

1922

2023
def setup_fastcsp_logger(
@@ -23,16 +26,30 @@ def setup_fastcsp_logger(
2326
level: str = "INFO",
2427
console_output: bool = True,
2528
append: bool = True,
29+
format_str: str | None = None,
2630
) -> logging.Logger:
27-
"""Set up the centralized FastCSP logger."""
31+
"""
32+
Set up the centralized FastCSP logger.
33+
34+
Args:
35+
name: Logger name (default: "fastcsp").
36+
log_file: Path to log file (default: None = no file logging).
37+
level: Logging level (default: "INFO").
38+
console_output: Whether to output to console (default: True).
39+
append: Whether to append to existing log file (default: True).
40+
format_str: Log message format string (default: standard format).
41+
42+
Returns:
43+
Configured logging.Logger instance.
44+
"""
2845
logger = logging.getLogger(name)
2946
logger.setLevel(getattr(logging, level.upper()))
3047
logger.handlers.clear()
3148

32-
formatter = logging.Formatter(
33-
"%(asctime)s - %(name)s - %(levelname)s - %(message)s",
34-
datefmt="%Y-%m-%d %H:%M:%S",
35-
)
49+
if format_str is None:
50+
format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
51+
52+
formatter = logging.Formatter(format_str, datefmt="%Y-%m-%d %H:%M:%S")
3653

3754
if console_output:
3855
console_handler = logging.StreamHandler(sys.stdout)
@@ -49,16 +66,40 @@ def setup_fastcsp_logger(
4966
return logger
5067

5168

69+
def setup_fastcsp_logger_from_config(
70+
config: LoggingConfig, name: str = "fastcsp"
71+
) -> logging.Logger:
72+
"""
73+
Set up the centralized FastCSP logger from a LoggingConfig dataclass.
74+
75+
Args:
76+
config: LoggingConfig instance with logging parameters.
77+
name: Logger name (default: "fastcsp").
78+
79+
Returns:
80+
Configured logging.Logger instance.
81+
"""
82+
return setup_fastcsp_logger(
83+
name=name,
84+
log_file=config.log_file,
85+
level=config.level,
86+
console_output=config.console_output,
87+
append=config.append,
88+
format_str=config.format,
89+
)
90+
91+
5292
def ensure_all_modules_use_central_logger() -> None:
5393
"""Configure all FastCSP modules to use the central logger."""
5494
central_logger = logging.getLogger("fastcsp")
5595

5696
# All module patterns to redirect
97+
# Note: submitit is excluded because its DEBUG output contains carriage returns
98+
# for progress bars which create long garbled lines in log files
5799
module_patterns = [
58100
"fastcsp",
59101
"fairchem.applications.fastcsp",
60102
"genarris",
61-
"submitit",
62103
]
63104

64105
# Find and configure all matching modules

src/fairchem/applications/fastcsp/core/utils/structure.py

Lines changed: 12 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -54,36 +54,14 @@ def cif_to_structure(cif: str) -> Structure | None:
5454

5555
def cif_to_atoms(cif: str) -> Atoms | None:
5656
"""
57-
Convert CIF string to ASE (Atomic Simulation Environment) Atoms object.
58-
59-
This function provides a direct path from CIF format to ASE Atoms objects,
60-
which are commonly used for structure optimization and analysis.
61-
62-
Args:
63-
cif: CIF format string containing crystal structure data
64-
65-
Returns:
66-
ASE Atoms object if conversion successful, None if cif is empty/invalid
57+
Convert CIF string to ASE Atoms object.
6758
"""
6859
return AseAtomsAdaptor.get_atoms(cif_to_structure(cif)) if cif else None
6960

7061

7162
def get_partition_id(key: str, npartitions: int = 1000) -> int:
7263
"""
73-
Generate a consistent partition ID for distributed processing of structures.
74-
75-
This function creates deterministic partitioning for parallel processing,
76-
ensuring that structures with the same key always map to the same partition.
77-
78-
Args:
79-
key: String identifier for the structure (e.g., molecule_name + space_group)
80-
npartitions: Total number of partitions for distribution (default: 1000)
81-
82-
Returns:
83-
int: Partition ID in range [0, npartitions-1]
84-
85-
Notes:
86-
- Deterministic: same key always produces same partition ID
64+
Generate consistent partition ID from key using MD5 hash.
8765
"""
8866
key_encoded = key.encode("utf-8")
8967
md5_hash = hashlib.md5()
@@ -102,12 +80,7 @@ def get_structure_hash(
10280
vol_bin_size: float = 0.2,
10381
) -> str:
10482
"""
105-
Generate a hash string for crystal structure grouping and fast pre-filtering.
106-
107-
Creates a binned hash based on chemical formula and geometric properties to
108-
enable fast pre-filtering before expensive crystallographic comparisons.
109-
This approach dramatically reduces the number of structure pairs that need
110-
detailed comparison during deduplication.
83+
Generate hash string for structure grouping based on formula and binned properties.
11184
11285
Args:
11386
structure: Pymatgen Structure object to hash
@@ -118,35 +91,25 @@ def get_structure_hash(
11891
vol_bin_size: Bin edge length for volume discretization (Å); volumes are binned in steps of vol_bin_size**3
11992
12093
Returns:
121-
Hash string combining formula, Z, and optionally density/volume bins
122-
123-
Hashing Strategy:
124-
1. Start with reduced chemical formula and Z value
125-
2. Add binned density if use_density=True for packing similarity
126-
3. Add binned volume if use_volume=True for volume grouping
127-
4. Combine components for readable hash
128-
129-
Example:
130-
>>> get_structure_hash(structure, z=4, use_density=True)
131-
"C6H4O4_4_1.5_125.2" # Formula_Z_density_volume
94+
Hash string like "C6H4O4_z4_d1.5_v125.2".
13295
"""
133-
# Start with chemical composition and stoichiometry
96+
# Start with chemical composition
13497
formula = structure.composition.reduced_formula
135-
hash_components = [formula, str(z)]
98+
hash_components = [formula, "z" + str(z)]
13699

137100
# Add density-based grouping if requested
138101
if use_density:
139102
density = structure.density
140103
# Bin density to group structures with similar packing
141104
density_bin = round(density / density_bin_size) * density_bin_size
142-
hash_components.append(f"{density_bin:.1f}")
105+
hash_components.append(f"d{density_bin:.1f}")
143106

144107
# Add volume-based grouping if requested
145108
if use_volume:
146109
volume = structure.volume
147110
# Bin volume to group structures with similar cell sizes
148111
vol_bin = round(volume / vol_bin_size**3) * vol_bin_size**3
149-
hash_components.append(f"{vol_bin:.1f}")
112+
hash_components.append(f"v{vol_bin:.1f}")
150113

151114
# Combine all components into single hash string
152115
return "_".join(hash_components)
@@ -156,32 +119,14 @@ def check_no_changes_in_covalent_matrix(
156119
initial_atoms: Atoms, final_atoms: Atoms
157120
) -> bool:
158121
"""
159-
Validate that covalent bonding network is preserved during structure relaxation.
160-
161-
Compares the covalent bonding adjacency matrices before and after ML-based
162-
relaxation to detect unwanted chemical reconstructions. This validation ensures
163-
that the relaxation process only optimizes geometry without breaking or forming
164-
chemical bonds, which would indicate problematic initial structures or
165-
relaxation failures.
122+
Check if covalent bonding network is preserved after relaxation.
166123
167124
Args:
168-
initial_atoms: Original structure before relaxation
169-
final_atoms: Structure after ML-based relaxation
125+
initial_atoms: Structure before relaxation.
126+
final_atoms: Structure after relaxation.
170127
171128
Returns:
172-
True if bonding network is preserved, False otherwise
173-
Returns False if either structure is None (error handling)
174-
175-
Algorithm:
176-
1. Convert ASE Atoms to pymatgen Structures for analysis
177-
2. Use JmolNN to identify covalent neighbors in both structures
178-
3. Build adjacency matrices representing bonding networks
179-
4. Compare matrices for exact equality
180-
181-
Validation Purpose:
182-
- Detect atom overlaps that lead to artificial bonding
183-
- Identify relaxation artifacts that break molecular integrity
184-
- Filter out reconstructions that change chemical connectivity
129+
True if bonding network unchanged, False otherwise.
185130
"""
186131
# Handle error cases where structures couldn't be processed
187132
if initial_atoms is None or final_atoms is None:

src/fairchem/applications/fastcsp/core/workflow/eval.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ def _load_csd_structure_subprocess(cif_path: Path, csd_python_cmd: str):
128128
try:
129129
cif_content = cif_path.read_text()
130130
crystal = Crystal.from_string(cif_content, "cif")
131-
print(str(crystal))
132131
except Exception as e:
133132
print(f"Error loading structure: {{e}}", file=sys.stderr)
134133
sys.exit(1)
@@ -163,7 +162,7 @@ def _match_csd(row, target_xtals, shell_size=30):
163162
raise ImportError("CSD Python API required for CCDC matching.") from e
164163

165164
try:
166-
gen_xtal = Crystal.from_string(row.relaxed_cif, "cif")
165+
gen_xtal = Crystal.from_string(row.cif_relaxed, "cif")
167166
except Exception as e:
168167
logger.error(f"Error parsing CSD structure {row.structure_id}: {e}")
169168
return None, None
@@ -226,7 +225,7 @@ def ccdc_match_settings(shell_size=30, ignore_H=True, mol_diff=False):
226225
# Replicate _match_csd function locally
227226
def match_csd_local(row, target_xtals, shell_size=30):
228227
try:
229-
gen_xtal = Crystal.from_string(row.relaxed_cif, "cif")
228+
gen_xtal = Crystal.from_string(row.cif_relaxed, "cif")
230229
except Exception as e:
231230
return None, None
232231
@@ -250,7 +249,7 @@ def match_csd_local(row, target_xtals, shell_size=30):
250249
return best_match_refcode, best_rmsd
251250
252251
# Simple objects to match function interface
253-
Row = namedtuple("Row", ["structure_id", "relaxed_cif"])
252+
Row = namedtuple("Row", ["structure_id", "cif_relaxed"])
254253
255254
# Parse input and reconstruct target structures
256255
data = json.loads(sys.stdin.read())
@@ -272,7 +271,7 @@ def match_csd_local(row, target_xtals, shell_size=30):
272271
"""
273272

274273
input_data = {
275-
"structure_cif": row.relaxed_cif,
274+
"structure_cif": row.cif_relaxed,
276275
"target_cifs": target_cifs,
277276
"shell_size": shell_size,
278277
"structure_id": row.structure_id,
@@ -361,7 +360,7 @@ def match_structures(row, target_structures, eval_method="csd", **kwargs):
361360
pymatgen StructureMatcher.
362361
363362
Args:
364-
row: DataFrame row containing structure data with 'relaxed_cif' column
363+
row: DataFrame row containing structure data with 'cif_relaxed' column
365364
target_structures: Dictionary mapping reference codes to target structures
366365
(CCDC Crystal objects for CSD, pymatgen Structure for pymatgen)
367366
eval_method: Evaluation method ('csd' or 'pymatgen')
@@ -445,9 +444,7 @@ def load_target_structures(
445444
continue
446445

447446
try:
448-
structure = _load_single_structure(
449-
cif_path, eval_method, **kwargs
450-
)
447+
structure = _load_single_structure(cif_path, eval_method, **kwargs)
451448
target_structures[refcode] = structure
452449
logger.debug(f"Loaded structure for {refcode} from {cif_path}")
453450
except Exception as e:
@@ -470,7 +467,9 @@ def load_target_structures(
470467
continue
471468

472469
try:
473-
structure = _load_single_structure(cif_file, eval_method, **kwargs)
470+
structure = _load_single_structure(
471+
cif_file, eval_method, **kwargs
472+
)
474473
target_structures[refcode] = structure
475474
logger.debug(f"Loaded structure for {refcode} from {cif_file}")
476475
except Exception as e:

src/fairchem/applications/fastcsp/core/workflow/filter.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
from __future__ import annotations
2828

29+
import os
2930
from typing import TYPE_CHECKING, Any
3031

3132
import pandas as pd
@@ -87,7 +88,7 @@ def filter_and_deduplicate_structures_single(
8788
root_unrelaxed: Path | None = None,
8889
):
8990
"""
90-
Apply filtering to a single dataset.
91+
Apply filtering and deduplication to a single parquet dataset.
9192
9293
Args:
9394
input_filename: Path to input parquet file with structure data
@@ -131,15 +132,17 @@ def filter_and_deduplicate_structures_single(
131132
)
132133

133134
# Convert CIF strings to atomic structures for connectivity analysis
134-
final_atoms = structures_df["relaxed_cif"].apply(cif_to_atoms)
135+
final_atoms = structures_df["cif_relaxed"].apply(cif_to_atoms)
135136
initial_atoms = structures_df["cif"].apply(cif_to_atoms)
136137

137138
# Validate bonding network preservation during relaxation
138-
structures_df["connectivity_unchanged"] = p_map(
139+
140+
num_cpus = max(len(os.sched_getaffinity(0)), 1)
141+
structures_df["validity.connectivity_unchanged"] = p_map(
139142
check_no_changes_in_covalent_matrix,
140143
initial_atoms,
141144
final_atoms,
142-
num_cpus=70, # Parallel processing for connectivity validation
145+
num_cpus=num_cpus,
143146
)
144147

145148
# Save intermediate results with connectivity validation flags
@@ -180,7 +183,7 @@ def filter_and_deduplicate_structures_single(
180183
logger.info(f"After filtering by energy: {structures_df_filtered.shape}")
181184

182185
# Convert CIF strings to pymatgen Structures for deduplication
183-
structures_df_filtered["structure"] = structures_df_filtered["relaxed_cif"].apply(
186+
structures_df_filtered["structure"] = structures_df_filtered["cif_relaxed"].apply(
184187
cif_to_structure
185188
)
186189

@@ -232,7 +235,7 @@ def filter_and_deduplicate_structures(
232235
root_unrelaxed: Path | None = None,
233236
):
234237
"""
235-
Orchestrate parallel filtering and deduplication across multiple structure datasets.
238+
Submit parallel filtering jobs for multiple datasets.
236239
237240
Args:
238241
input_dir: Root directory containing multiple dataset directories
@@ -249,7 +252,7 @@ def filter_and_deduplicate_structures(
249252
root_unrelaxed: Root directory with unrelaxed structures
250253
251254
Returns:
252-
List of submitit job objects for monitoring progress
255+
List of submitit job objects.
253256
"""
254257
logger = get_central_logger()
255258

0 commit comments

Comments
 (0)