@@ -79,11 +79,11 @@ def diann_convert(
7979 all_labels .update (fd ["labels" ])
8080 plex_info = detect_plexdia_type (all_labels )
8181
82- # Get enzyme ( must be consistent across experiment)
83- enzymes = {fd ["enzyme" ] for fd in file_data .values ()}
84- if len (enzymes ) > 1 :
85- raise ValueError (f"Multiple enzymes not supported : { enzymes } " )
86- enzyme = enzymes .pop ()
82+ # Enzyme set (tuple of normalized names) must be consistent across files.
83+ enzyme_sets = {fd ["enzyme" ] for fd in file_data .values ()}
84+ if len (enzyme_sets ) > 1 :
85+ raise ValueError (f"Inconsistent enzyme sets across files : { enzyme_sets } " )
86+ enzymes = enzyme_sets .pop () # tuple[str, ...]
8787
8888 # Get modifications (must be consistent across experiment)
8989 fixed_mods_set = {tuple (fd ["fixed_mods" ]) for fd in file_data .values ()}
@@ -131,14 +131,26 @@ def diann_convert(
131131
132132 # Write config file
133133 self ._write_config (
134- enzyme , diann_fixed , diann_var , plex_info , tolerance_summary , scan_range_summary , monitor_mods
134+ enzymes , diann_fixed , diann_var , plex_info , tolerance_summary , scan_range_summary , monitor_mods
135135 )
136136
137137 # Write filemap
138138 self ._write_filemap (file_data , plex_info , design_rows )
139139
140140 self .report_warnings ()
141141
142+ @staticmethod
143+ def _find_enzyme_columns (sdrf : pd .DataFrame ) -> list [str ]:
144+ """Return all `comment[cleavage agent details]` columns.
145+
146+ Includes pandas-renamed duplicates (e.g. `…].1`, `…].2`).
147+ """
148+ return [
149+ c
150+ for c in sdrf .columns
151+ if c == "comment[cleavage agent details]" or c .startswith ("comment[cleavage agent details]." )
152+ ]
153+
142154 def _extract_file_data (self , sdrf : pd .DataFrame ) -> dict :
143155 """Extract per-file metadata from SDRF rows.
144156
@@ -151,6 +163,9 @@ def _extract_file_data(self, sdrf: pd.DataFrame) -> dict:
151163 # Find modification columns
152164 mod_cols = [c for c in sdrf .columns if c .startswith ("comment[modification parameters" )]
153165
166+ # Find enzyme columns (handles pandas-renamed duplicates).
167+ enzyme_cols = self ._find_enzyme_columns (sdrf )
168+
154169 for _ , row in sdrf .iterrows ():
155170 raw = str (row .get ("comment[data file]" , "" )).strip ()
156171 if not raw :
@@ -183,9 +198,10 @@ def _extract_file_data(self, sdrf: pd.DataFrame) -> dict:
183198 if label and label not in fd ["labels" ]:
184199 fd ["labels" ].append (label )
185200
186- # Enzyme (first row wins)
201+ # Enzymes (first row wins). May be a tuple of multiple enzymes
202+ # when the SDRF declares more than one cleavage-agent column.
187203 if fd ["enzyme" ] is None :
188- fd ["enzyme" ] = self ._extract_enzyme (row )
204+ fd ["enzyme" ] = self ._extract_enzymes (row , enzyme_cols )
189205
190206 # Modifications (first row wins)
191207 if not fd ["fixed_mods" ] and not fd ["var_mods" ]:
@@ -337,21 +353,32 @@ def _extract_label(self, row: pd.Series) -> str:
337353
338354 return label_str
339355
340- def _extract_enzyme (self , row : pd .Series ) -> str :
341- """Extract enzyme from comment[cleavage agent details]."""
342- if "comment[cleavage agent details]" not in row .index :
356+ def _extract_enzymes (self , row : pd .Series , enzyme_cols : list [str ]) -> tuple [str , ...]:
357+ """Extract all declared enzymes for a row, in column order, deduplicated.
358+
359+ Skips empty / "not available" cells. Normalizes via ENZYME_NAME_MAPPINGS.
360+ Returns a tuple of normalized enzyme names. Raises ValueError if no
361+ cleavage agent column is provided, or if every cell is empty (preserves
362+ the prior single-column strictness).
363+ """
364+ if not enzyme_cols :
343365 raise ValueError ("Missing comment[cleavage agent details] column" )
344366
345- enzyme_str = str (row ["comment[cleavage agent details]" ]).strip ()
346- nt_match = re .search (r"NT=(.+?)(;|$)" , enzyme_str )
347- if nt_match :
348- enzyme_name = nt_match .group (1 ).strip ()
349- else :
350- enzyme_name = enzyme_str
367+ names : list [str ] = []
368+ for col in enzyme_cols :
369+ raw_val = str (row .get (col , "" )).strip ()
370+ if not raw_val or raw_val .lower () in ("nan" , "not available" ):
371+ continue
372+ nt_match = re .search (r"NT=(.+?)(;|$)" , raw_val )
373+ enzyme_name = nt_match .group (1 ).strip () if nt_match else raw_val
374+ normalized = ENZYME_NAME_MAPPINGS .get (enzyme_name .lower (), enzyme_name )
375+ if normalized not in names :
376+ names .append (normalized )
351377
352- # Normalize
353- normalized = ENZYME_NAME_MAPPINGS .get (enzyme_name .lower (), enzyme_name )
354- return normalized
378+ if not names :
379+ raise ValueError ("Row has no usable cleavage agent value" )
380+
381+ return tuple (names )
355382
356383 def _extract_modifications (self , row : pd .Series , mod_cols : list [str ]) -> tuple [list , list ]:
357384 """Extract fixed and variable modifications from SDRF row."""
@@ -588,9 +615,62 @@ def _resolve_monitor_mods(self, mod_localization: str) -> list[str]:
588615 )
589616 return unimod_ids
590617
618+ def _combine_cut_rules (self , enzymes : tuple [str , ...]) -> str | None :
619+ """Combine DIA-NN --cut rules across multiple enzymes.
620+
621+ - Positives (cleavage tokens) are unioned across enzymes (first-seen order).
622+ - Negations (e.g. !*P) are intersected: a "do not cleave" constraint
623+ only survives if EVERY contributing enzyme imposes it. This makes
624+ /P variants (which lack !*P) correctly relax the proline restriction.
625+ - Unknown enzymes (not in ENZYME_SPECIFICITY) are warned about and
626+ skipped. Returns None if every enzyme is unknown.
627+ """
628+ rules : list [str ] = []
629+ unknown : list [str ] = []
630+ for e in enzymes :
631+ rule = ENZYME_SPECIFICITY .get (e )
632+ if rule is None :
633+ unknown .append (e )
634+ else :
635+ rules .append (rule )
636+
637+ if unknown and rules :
638+ known = [e for e in enzymes if e not in unknown ]
639+ self .add_warning (
640+ f"Unknown enzyme(s) { unknown } in multi-enzyme SDRF — no --cut rule "
641+ f"available for them. Proceeding with known enzymes only: { known } ."
642+ )
643+
644+ if not rules :
645+ return None
646+
647+ positive_lists : list [list [str ]] = []
648+ negative_sets : list [set [str ]] = []
649+ for rule in rules :
650+ positives : list [str ] = []
651+ negatives : set [str ] = set ()
652+ for tok in (t .strip () for t in rule .split ("," )):
653+ if not tok :
654+ continue
655+ if tok .startswith ("!" ):
656+ negatives .add (tok )
657+ elif tok not in positives :
658+ positives .append (tok )
659+ positive_lists .append (positives )
660+ negative_sets .append (negatives )
661+
662+ merged_positives : list [str ] = []
663+ for pos in positive_lists :
664+ for tok in pos :
665+ if tok not in merged_positives :
666+ merged_positives .append (tok )
667+
668+ merged_negatives = set .intersection (* negative_sets ) if negative_sets else set ()
669+ return "," .join (merged_positives + sorted (merged_negatives ))
670+
591671 def _write_config (
592672 self ,
593- enzyme : str ,
673+ enzymes : tuple [ str , ...] ,
594674 fixed_mods : list [str ],
595675 var_mods : list [str ],
596676 plex_info : dict | None ,
@@ -601,12 +681,22 @@ def _write_config(
601681 """Write diann_config.cfg."""
602682 parts = []
603683
604- # Enzyme cut rule
605- cut_rule = ENZYME_SPECIFICITY .get (enzyme )
606- if cut_rule :
607- parts .append (f"--cut { cut_rule } " )
684+ # Enzyme cut rule. Single-enzyme path preserves the existing
685+ # "Unknown enzyme" warning; multi-enzyme path delegates to combiner.
686+ if len (enzymes ) == 1 :
687+ single = enzymes [0 ]
688+ cut_rule = ENZYME_SPECIFICITY .get (single )
689+ if cut_rule :
690+ parts .append (f"--cut { cut_rule } " )
691+ else :
692+ self .add_warning (f"Unknown enzyme '{ single } ', no --cut rule generated" )
608693 else :
609- self .add_warning (f"Unknown enzyme '{ enzyme } ', no --cut rule generated" )
694+ combined = self ._combine_cut_rules (enzymes )
695+ if combined :
696+ parts .append (f"--cut { combined } " )
697+ self .add_warning (f"Combined { len (enzymes )} cleavage agents { list (enzymes )} into --cut { combined } " )
698+ else :
699+ self .add_warning (f"All enzymes { list (enzymes )} unknown, no --cut rule generated" )
610700
611701 # Standard fixed modifications
612702 for mod in fixed_mods :
@@ -721,7 +811,7 @@ def _filemap_row(self, filename: str, fd: dict, label: str, label_type: str, des
721811 "DissociationMethod" : design ["dissociation_method" ] if design else "" ,
722812 "Condition" : design ["condition" ] if design else "" ,
723813 "BioReplicate" : design ["bioreplicate" ] if design else "" ,
724- "Enzyme" : fd ["enzyme" ],
814+ "Enzyme" : "+" . join ( fd ["enzyme" ]) ,
725815 "FixedModifications" : ";" .join (fd ["fixed_mods" ]),
726816 "VariableModifications" : ";" .join (fd ["var_mods" ]),
727817 "PrecursorMassTolerance" : fd ["precursor_tol" ] or "" ,
0 commit comments