facebookresearch
diff --git a/‎src/fairchem/applications/fastcsp/core/utils/configuration.py‎
Lines changed: 3 additions & 3 deletions b/‎src/fairchem/applications/fastcsp/core/utils/configuration.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/fairchem/applications/fastcsp/core/utils/deduplicate.py‎
Lines changed: 8 additions & 4 deletions b/‎src/fairchem/applications/fastcsp/core/utils/deduplicate.py‎
Lines changed: 8 additions & 4 deletions
diff --git a/‎src/fairchem/applications/fastcsp/core/utils/logging.py‎
Lines changed: 48 additions & 7 deletions b/‎src/fairchem/applications/fastcsp/core/utils/logging.py‎
Lines changed: 48 additions & 7 deletions
diff --git a/‎src/fairchem/applications/fastcsp/core/utils/structure.py‎
Lines changed: 12 additions & 67 deletions b/‎src/fairchem/applications/fastcsp/core/utils/structure.py‎
Lines changed: 12 additions & 67 deletions
diff --git a/‎src/fairchem/applications/fastcsp/core/workflow/eval.py‎
Lines changed: 9 additions & 10 deletions b/‎src/fairchem/applications/fastcsp/core/workflow/eval.py‎
Lines changed: 9 additions & 10 deletions
diff --git a/‎src/fairchem/applications/fastcsp/core/workflow/filter.py‎
Lines changed: 10 additions & 7 deletions b/‎src/fairchem/applications/fastcsp/core/workflow/filter.py‎
Lines changed: 10 additions & 7 deletions
@@ -165,9 +165,9 @@ def _validate_tolerance_params(params: dict[str, Any], param_set_name: str) -> N
     """Validate tolerance parameters are positive numbers."""
     tolerance_params = ["ltol", "stol", "angle_tol"]
     for param in tolerance_params:
-        if (param in params and not isinstance(params[param], (int, float))) or params[
-            param
-        ] <= 0:
+        if param in params and (
+            not isinstance(params[param], (int, float)) or params[param] <= 0
+        ):
             raise ValueError(
                 f"'{param}' in '{param_set_name}' must be a positive number"
             )
 
@@ -24,6 +24,10 @@
 def process_structure_group(group_data, ltol=0.2, stol=0.3, angle_tol=5):
     """
     Apply crystallographic deduplication within a pre-filtered structure group.
+
+    Args:
+        group_data: Tuple of (indices, structures) for this group
+        ltol, stol, angle_tol: Matching tolerances (presumably for pymatgen StructureMatcher)
     """
     indices, structures = group_data
 
@@ -85,10 +89,10 @@ def deduplicate_structures(
     logger = get_central_logger()
 
     # Stage 1: Generate hash-based groups for pre-filtering
-    logger.info("Generating structure hashes for pre-filtering...")
-    logger.info(f"Hashing settings - Density: {hash_density}, Volume: {hash_volume}")
-    logger.info(f"Total structures to process: {len(structures_df)}")
-    logger.info(f"Structure DataFrame head:\n{structures_df.head()}")
+    logger.debug("Generating structure hashes for pre-filtering...")
+    logger.debug(f"Hashing settings - Density: {hash_density}, Volume: {hash_volume}")
+    logger.debug(f"Total structures to process: {len(structures_df)}")
+    logger.debug(f"Structure DataFrame head:\n{structures_df.head()}")
     hashes = structures_df[["structure", "z"]].apply(
         lambda x: get_structure_hash(
             x["structure"],
 
@@ -14,7 +14,10 @@
 import sys
 from datetime import datetime
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from fairchem.applications.fastcsp.core.utils.configs import LoggingConfig
 
 
 def setup_fastcsp_logger(
@@ -23,16 +26,30 @@ def setup_fastcsp_logger(
     level: str = "INFO",
     console_output: bool = True,
     append: bool = True,
+    format_str: str | None = None,
 ) -> logging.Logger:
-    """Set up the centralized FastCSP logger."""
+    """
+    Set up the centralized FastCSP logger.
+
+    Args:
+        name: Logger name (default: "fastcsp").
+        log_file: Path to log file (default: None = no file logging).
+        level: Logging level (default: "INFO").
+        console_output: Whether to output to console (default: True).
+        append: Whether to append to existing log file (default: True).
+        format_str: Log message format string (default: standard format).
+
+    Returns:
+        Configured logging.Logger instance.
+    """
     logger = logging.getLogger(name)
     logger.setLevel(getattr(logging, level.upper()))
     logger.handlers.clear()
 
-    formatter = logging.Formatter(
-        "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-    )
+    if format_str is None:
+        format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+    formatter = logging.Formatter(format_str, datefmt="%Y-%m-%d %H:%M:%S")
 
     if console_output:
         console_handler = logging.StreamHandler(sys.stdout)
@@ -49,16 +66,40 @@ def setup_fastcsp_logger(
     return logger
 
 
+def setup_fastcsp_logger_from_config(
+    config: LoggingConfig, name: str = "fastcsp"
+) -> logging.Logger:
+    """
+    Set up the centralized FastCSP logger from a LoggingConfig dataclass.
+
+    Args:
+        config: LoggingConfig instance with logging parameters.
+        name: Logger name (default: "fastcsp").
+
+    Returns:
+        Configured logging.Logger instance.
+    """
+    return setup_fastcsp_logger(
+        name=name,
+        log_file=config.log_file,
+        level=config.level,
+        console_output=config.console_output,
+        append=config.append,
+        format_str=config.format,
+    )
+
+
 def ensure_all_modules_use_central_logger() -> None:
     """Configure all FastCSP modules to use the central logger."""
     central_logger = logging.getLogger("fastcsp")
 
     # All module patterns to redirect
+    # Note: submitit is excluded because its DEBUG output contains carriage returns
+    # for progress bars which create long garbled lines in log files
     module_patterns = [
         "fastcsp",
         "fairchem.applications.fastcsp",
         "genarris",
-        "submitit",
     ]
 
     # Find and configure all matching modules
 
@@ -54,36 +54,14 @@ def cif_to_structure(cif: str) -> Structure | None:
 
 def cif_to_atoms(cif: str) -> Atoms | None:
     """
-    Convert CIF string to ASE (Atomic Simulation Environment) Atoms object.
-
-    This function provides a direct path from CIF format to ASE Atoms objects,
-    which are commonly used for structure optimization and analysis.
-
-    Args:
-        cif: CIF format string containing crystal structure data
-
-    Returns:
-        ASE Atoms object if conversion successful, None if cif is empty/invalid
+    Convert CIF string to ASE Atoms object.
     """
     return AseAtomsAdaptor.get_atoms(cif_to_structure(cif)) if cif else None
 
 
 def get_partition_id(key: str, npartitions: int = 1000) -> int:
     """
-    Generate a consistent partition ID for distributed processing of structures.
-
-    This function creates deterministic partitioning for parallel processing,
-    ensuring that structures with the same key always map to the same partition.
-
-    Args:
-        key: String identifier for the structure (e.g., molecule_name + space_group)
-        npartitions: Total number of partitions for distribution (default: 1000)
-
-    Returns:
-        int: Partition ID in range [0, npartitions-1]
-
-    Notes:
-        - Deterministic: same key always produces same partition ID
+    Generate consistent partition ID from key using MD5 hash.
     """
     key_encoded = key.encode("utf-8")
     md5_hash = hashlib.md5()
@@ -102,12 +80,7 @@ def get_structure_hash(
     vol_bin_size: float = 0.2,
 ) -> str:
     """
-    Generate a hash string for crystal structure grouping and fast pre-filtering.
-
-    Creates a binned hash based on chemical formula and geometric properties to
-    enable fast pre-filtering before expensive crystallographic comparisons.
-    This approach dramatically reduces the number of structure pairs that need
-    detailed comparison during deduplication.
+    Generate hash string for structure grouping based on formula and binned properties.
 
     Args:
         structure: Pymatgen Structure object to hash
@@ -118,35 +91,25 @@ def get_structure_hash(
         vol_bin_size: Bin edge length for volume discretization (Å); bins are vol_bin_size**3 wide
 
     Returns:
-        Hash string combining formula, Z, and optionally density/volume bins
-
-    Hashing Strategy:
-        1. Start with reduced chemical formula and Z value
-        2. Add binned density if use_density=True for packing similarity
-        3. Add binned volume if use_volume=True for volume grouping
-        4. Combine components for readable hash
-
-    Example:
-        >>> get_structure_hash(structure, z=4, use_density=True)
-        "C6H4O4_4_1.5_125.2"  # Formula_Z_density_volume
+        Hash string like "C6H4O4_z4_d1.5_v125.2".
     """
-    # Start with chemical composition and stoichiometry
+    # Start with chemical composition
     formula = structure.composition.reduced_formula
-    hash_components = [formula, str(z)]
+    hash_components = [formula, "z" + str(z)]
 
     # Add density-based grouping if requested
     if use_density:
         density = structure.density
         # Bin density to group structures with similar packing
         density_bin = round(density / density_bin_size) * density_bin_size
-        hash_components.append(f"{density_bin:.1f}")
+        hash_components.append(f"d{density_bin:.1f}")
 
     # Add volume-based grouping if requested
     if use_volume:
         volume = structure.volume
         # Bin volume to group structures with similar cell sizes
         vol_bin = round(volume / vol_bin_size**3) * vol_bin_size**3
-        hash_components.append(f"{vol_bin:.1f}")
+        hash_components.append(f"v{vol_bin:.1f}")
 
     # Combine all components into single hash string
     return "_".join(hash_components)
@@ -156,32 +119,14 @@ def check_no_changes_in_covalent_matrix(
     initial_atoms: Atoms, final_atoms: Atoms
 ) -> bool:
     """
-    Validate that covalent bonding network is preserved during structure relaxation.
-
-    Compares the covalent bonding adjacency matrices before and after ML-based
-    relaxation to detect unwanted chemical reconstructions. This validation ensures
-    that the relaxation process only optimizes geometry without breaking or forming
-    chemical bonds, which would indicate problematic initial structures or
-    relaxation failures.
+    Check if covalent bonding network is preserved after relaxation.
 
     Args:
-        initial_atoms: Original structure before relaxation
-        final_atoms: Structure after ML-based relaxation
+        initial_atoms: Structure before relaxation.
+        final_atoms: Structure after relaxation.
 
     Returns:
-        True if bonding network is preserved, False otherwise
-        Returns False if either structure is None (error handling)
-
-    Algorithm:
-        1. Convert ASE Atoms to pymatgen Structures for analysis
-        2. Use JmolNN to identify covalent neighbors in both structures
-        3. Build adjacency matrices representing bonding networks
-        4. Compare matrices for exact equality
-
-    Validation Purpose:
-        - Detect atom overlaps that lead to artificial bonding
-        - Identify relaxation artifacts that break molecular integrity
-        - Filter out reconstructions that change chemical connectivity
+        True if bonding network unchanged, False otherwise.
     """
     # Handle error cases where structures couldn't be processed
     if initial_atoms is None or final_atoms is None:
 
@@ -128,7 +128,6 @@ def _load_csd_structure_subprocess(cif_path: Path, csd_python_cmd: str):
 try:
     cif_content = cif_path.read_text()
     crystal = Crystal.from_string(cif_content, "cif")
-    print(str(crystal))
 except Exception as e:
     print(f"Error loading structure: {{e}}", file=sys.stderr)
     sys.exit(1)
@@ -163,7 +162,7 @@ def _match_csd(row, target_xtals, shell_size=30):
         raise ImportError("CSD Python API required for CCDC matching.") from e
 
     try:
-        gen_xtal = Crystal.from_string(row.relaxed_cif, "cif")
+        gen_xtal = Crystal.from_string(row.cif_relaxed, "cif")
     except Exception as e:
         logger.error(f"Error parsing CSD structure {row.structure_id}: {e}")
         return None, None
@@ -226,7 +225,7 @@ def ccdc_match_settings(shell_size=30, ignore_H=True, mol_diff=False):
 # Replicate _match_csd function locally
 def match_csd_local(row, target_xtals, shell_size=30):
     try:
-        gen_xtal = Crystal.from_string(row.relaxed_cif, "cif")
+        gen_xtal = Crystal.from_string(row.cif_relaxed, "cif")
     except Exception as e:
         return None, None
 
@@ -250,7 +249,7 @@ def match_csd_local(row, target_xtals, shell_size=30):
     return best_match_refcode, best_rmsd
 
 # Simple objects to match function interface
-Row = namedtuple("Row", ["structure_id", "relaxed_cif"])
+Row = namedtuple("Row", ["structure_id", "cif_relaxed"])
 
 # Parse input and reconstruct target structures
 data = json.loads(sys.stdin.read())
@@ -272,7 +271,7 @@ def match_csd_local(row, target_xtals, shell_size=30):
 """
 
     input_data = {
-        "structure_cif": row.relaxed_cif,
+        "structure_cif": row.cif_relaxed,
         "target_cifs": target_cifs,
         "shell_size": shell_size,
         "structure_id": row.structure_id,
@@ -361,7 +360,7 @@ def match_structures(row, target_structures, eval_method="csd", **kwargs):
     pymatgen StructureMatcher.
 
     Args:
-        row: DataFrame row containing structure data with 'relaxed_cif' column
+        row: DataFrame row containing structure data with 'cif_relaxed' column
         target_structures: Dictionary mapping reference codes to target structures
                           (CCDC Crystal objects for CSD, pymatgen Structure for pymatgen)
         eval_method: Evaluation method ('csd' or 'pymatgen')
@@ -445,9 +444,7 @@ def load_target_structures(
                         continue
 
                 try:
-                    structure = _load_single_structure(
-                        cif_path, eval_method, **kwargs
-                    )
+                    structure = _load_single_structure(cif_path, eval_method, **kwargs)
                     target_structures[refcode] = structure
                     logger.debug(f"Loaded structure for {refcode} from {cif_path}")
                 except Exception as e:
@@ -470,7 +467,9 @@ def load_target_structures(
                         continue
 
                     try:
-                        structure = _load_single_structure(cif_file, eval_method, **kwargs)
+                        structure = _load_single_structure(
+                            cif_file, eval_method, **kwargs
+                        )
                         target_structures[refcode] = structure
                         logger.debug(f"Loaded structure for {refcode} from {cif_file}")
                     except Exception as e:
 
@@ -26,6 +26,7 @@
 
 from __future__ import annotations
 
+import os
 from typing import TYPE_CHECKING, Any
 
 import pandas as pd
@@ -87,7 +88,7 @@ def filter_and_deduplicate_structures_single(
     root_unrelaxed: Path | None = None,
 ):
     """
-    Apply filtering to a single dataset.
+    Apply filtering and deduplication to a single parquet dataset.
 
     Args:
         input_filename: Path to input parquet file with structure data
@@ -131,15 +132,17 @@ def filter_and_deduplicate_structures_single(
         )
 
         # Convert CIF strings to atomic structures for connectivity analysis
-        final_atoms = structures_df["relaxed_cif"].apply(cif_to_atoms)
+        final_atoms = structures_df["cif_relaxed"].apply(cif_to_atoms)
         initial_atoms = structures_df["cif"].apply(cif_to_atoms)
 
         # Validate bonding network preservation during relaxation
-        structures_df["connectivity_unchanged"] = p_map(
+
+        num_cpus = max(len(os.sched_getaffinity(0)), 1)
+        structures_df["validity.connectivity_unchanged"] = p_map(
             check_no_changes_in_covalent_matrix,
             initial_atoms,
             final_atoms,
-            num_cpus=70,  # Parallel processing for connectivity validation
+            num_cpus=num_cpus,
         )
 
         # Save intermediate results with connectivity validation flags
@@ -180,7 +183,7 @@ def filter_and_deduplicate_structures_single(
         logger.info(f"After filtering by energy: {structures_df_filtered.shape}")
 
     # Convert CIF strings to pymatgen Structures for deduplication
-    structures_df_filtered["structure"] = structures_df_filtered["relaxed_cif"].apply(
+    structures_df_filtered["structure"] = structures_df_filtered["cif_relaxed"].apply(
         cif_to_structure
     )
 
@@ -232,7 +235,7 @@ def filter_and_deduplicate_structures(
     root_unrelaxed: Path | None = None,
 ):
     """
-    Orchestrate parallel filtering and deduplication across multiple structure datasets.
+    Submit parallel filtering jobs for multiple datasets.
 
     Args:
         input_dir: Root directory containing multiple dataset directories
@@ -249,7 +252,7 @@ def filter_and_deduplicate_structures(
         root_unrelaxed: Root directory with unrelaxed structures
 
     Returns:
-        List of submitit job objects for monitoring progress
+        List of submitit job objects.
     """
     logger = get_central_logger()