Skip to content

Commit 163743e

Browse files
authored
Merge pull request #297 from bigbio/dev
Minor changes in the diann convert to fix bug with multiple enzymes
2 parents 7f8d770 + a5b3e8c commit 163743e

8 files changed

Lines changed: 195 additions & 27 deletions

.coderabbit.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
reviews:
2+
tools:
3+
pydocstyle:
4+
enabled: false

src/sdrf_pipelines/converters/diann/diann.py

Lines changed: 117 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -79,11 +79,11 @@ def diann_convert(
7979
all_labels.update(fd["labels"])
8080
plex_info = detect_plexdia_type(all_labels)
8181

82-
# Get enzyme (must be consistent across experiment)
83-
enzymes = {fd["enzyme"] for fd in file_data.values()}
84-
if len(enzymes) > 1:
85-
raise ValueError(f"Multiple enzymes not supported: {enzymes}")
86-
enzyme = enzymes.pop()
82+
# Enzyme set (tuple of normalized names) must be consistent across files.
83+
enzyme_sets = {fd["enzyme"] for fd in file_data.values()}
84+
if len(enzyme_sets) > 1:
85+
raise ValueError(f"Inconsistent enzyme sets across files: {enzyme_sets}")
86+
enzymes = enzyme_sets.pop() # tuple[str, ...]
8787

8888
# Get modifications (must be consistent across experiment)
8989
fixed_mods_set = {tuple(fd["fixed_mods"]) for fd in file_data.values()}
@@ -131,14 +131,26 @@ def diann_convert(
131131

132132
# Write config file
133133
self._write_config(
134-
enzyme, diann_fixed, diann_var, plex_info, tolerance_summary, scan_range_summary, monitor_mods
134+
enzymes, diann_fixed, diann_var, plex_info, tolerance_summary, scan_range_summary, monitor_mods
135135
)
136136

137137
# Write filemap
138138
self._write_filemap(file_data, plex_info, design_rows)
139139

140140
self.report_warnings()
141141

142+
@staticmethod
def _find_enzyme_columns(sdrf: pd.DataFrame) -> list[str]:
    """Return every ``comment[cleavage agent details]`` column of ``sdrf``.

    A column label matches when it equals the base name exactly or starts
    with the base name followed by a dot, which covers pandas-renamed
    duplicate headers (``….1``, ``….2``). Matches are returned in the order
    they appear in ``sdrf.columns``.

    Non-string labels (for example integer labels on a frame built without
    a header) are skipped instead of raising ``AttributeError``.
    """
    base = "comment[cleavage agent details]"
    renamed_prefix = base + "."
    return [
        col
        for col in sdrf.columns
        # Guard first: ``startswith`` only exists on string labels.
        if isinstance(col, str) and (col == base or col.startswith(renamed_prefix))
    ]
153+
142154
def _extract_file_data(self, sdrf: pd.DataFrame) -> dict:
143155
"""Extract per-file metadata from SDRF rows.
144156
@@ -151,6 +163,9 @@ def _extract_file_data(self, sdrf: pd.DataFrame) -> dict:
151163
# Find modification columns
152164
mod_cols = [c for c in sdrf.columns if c.startswith("comment[modification parameters")]
153165

166+
# Find enzyme columns (handles pandas-renamed duplicates).
167+
enzyme_cols = self._find_enzyme_columns(sdrf)
168+
154169
for _, row in sdrf.iterrows():
155170
raw = str(row.get("comment[data file]", "")).strip()
156171
if not raw:
@@ -183,9 +198,10 @@ def _extract_file_data(self, sdrf: pd.DataFrame) -> dict:
183198
if label and label not in fd["labels"]:
184199
fd["labels"].append(label)
185200

186-
# Enzyme (first row wins)
201+
# Enzymes (first row wins). May be a tuple of multiple enzymes
202+
# when the SDRF declares more than one cleavage-agent column.
187203
if fd["enzyme"] is None:
188-
fd["enzyme"] = self._extract_enzyme(row)
204+
fd["enzyme"] = self._extract_enzymes(row, enzyme_cols)
189205

190206
# Modifications (first row wins)
191207
if not fd["fixed_mods"] and not fd["var_mods"]:
@@ -337,21 +353,32 @@ def _extract_label(self, row: pd.Series) -> str:
337353

338354
return label_str
339355

340-
def _extract_enzyme(self, row: pd.Series) -> str:
341-
"""Extract enzyme from comment[cleavage agent details]."""
342-
if "comment[cleavage agent details]" not in row.index:
356+
def _extract_enzymes(self, row: pd.Series, enzyme_cols: list[str]) -> tuple[str, ...]:
    """Collect the normalized enzyme names declared on one SDRF row.

    Cells are read in the order given by ``enzyme_cols``. For each usable
    cell the ``NT=`` value is taken when present (otherwise the whole cell),
    then mapped through ``ENZYME_NAME_MAPPINGS`` using its lower-cased form;
    names without a mapping are kept as written. Repeated names are kept
    only once, at their first position.

    Cells that are empty, ``"not available"``, or the string rendering of a
    missing value (``nan``, ``None``, ``<NA>``) are skipped.

    Args:
        row: One SDRF row.
        enzyme_cols: Labels of the cleavage agent columns to read.

    Returns:
        Tuple of normalized enzyme names in column order, without repeats.

    Raises:
        ValueError: If ``enzyme_cols`` is empty, or if no cell on the row
            holds a usable value.
    """
    if not enzyme_cols:
        raise ValueError("Missing comment[cleavage agent details] column")

    # The cell is stringified before inspection, so missing values show up
    # as text: float NaN -> "nan", None -> "None", pandas.NA -> "<NA>".
    # Compared lower-cased; "" covers blank / whitespace-only cells.
    missing_markers = {"", "nan", "none", "<na>", "not available"}

    names: list[str] = []
    for col in enzyme_cols:
        raw_val = str(row.get(col, "")).strip()
        if raw_val.lower() in missing_markers:
            continue
        nt_match = re.search(r"NT=(.+?)(;|$)", raw_val)
        enzyme_name = nt_match.group(1).strip() if nt_match else raw_val
        normalized = ENZYME_NAME_MAPPINGS.get(enzyme_name.lower(), enzyme_name)
        if normalized not in names:
            names.append(normalized)

    if not names:
        raise ValueError("Row has no usable cleavage agent value")

    return tuple(names)
355382

356383
def _extract_modifications(self, row: pd.Series, mod_cols: list[str]) -> tuple[list, list]:
357384
"""Extract fixed and variable modifications from SDRF row."""
@@ -588,9 +615,62 @@ def _resolve_monitor_mods(self, mod_localization: str) -> list[str]:
588615
)
589616
return unimod_ids
590617

618+
def _combine_cut_rules(self, enzymes: tuple[str, ...]) -> str | None:
    """Merge the DIA-NN ``--cut`` rules of several enzymes into one rule.

    Each enzyme's rule is looked up in ``ENZYME_SPECIFICITY`` and split on
    commas into tokens:

    - Positive tokens (not starting with ``!``) are unioned across enzymes,
      keeping first-seen order.
    - Negation tokens (starting with ``!``, e.g. ``!*P``) are intersected:
      a negation survives only if every contributing rule contains it, so
      combining with a rule that lacks ``!*P`` drops that restriction.
      Surviving negations are appended in sorted order.

    Enzymes with no rule are left out, and one warning listing them is
    recorded when at least one other enzyme has a rule. An empty rule is
    treated the same as a missing one, matching the truthiness check used
    on the single-enzyme path in ``_write_config``; otherwise such an entry
    would contribute an empty negation set and silently erase every
    negation.

    Args:
        enzymes: Normalized enzyme names.

    Returns:
        The combined comma-separated rule, or ``None`` when no enzyme
        contributes a usable rule.
    """
    known: list[str] = []
    unknown: list[str] = []
    rules: list[str] = []
    for enzyme in enzymes:
        rule = ENZYME_SPECIFICITY.get(enzyme)
        if rule:
            known.append(enzyme)
            rules.append(rule)
        else:
            unknown.append(enzyme)

    if not rules:
        return None

    if unknown:
        self.add_warning(
            f"Unknown enzyme(s) {unknown} in multi-enzyme SDRF — no --cut rule "
            f"available for them. Proceeding with known enzymes only: {known}."
        )

    # A dict is used as an insertion-ordered set for the positive tokens.
    positives: dict[str, None] = {}
    shared_negatives: set[str] | None = None
    for rule in rules:
        negatives: set[str] = set()
        for tok in (t.strip() for t in rule.split(",")):
            if not tok:
                continue
            if tok.startswith("!"):
                negatives.add(tok)
            else:
                positives.setdefault(tok, None)
        # Running intersection: the first rule seeds the set, later rules narrow it.
        shared_negatives = negatives if shared_negatives is None else shared_negatives & negatives

    merged = [*positives, *sorted(shared_negatives or ())]
    # Keep the ``str | None`` contract: never hand back an empty rule string.
    return ",".join(merged) or None
670+
591671
def _write_config(
592672
self,
593-
enzyme: str,
673+
enzymes: tuple[str, ...],
594674
fixed_mods: list[str],
595675
var_mods: list[str],
596676
plex_info: dict | None,
@@ -601,12 +681,22 @@ def _write_config(
601681
"""Write diann_config.cfg."""
602682
parts = []
603683

604-
# Enzyme cut rule
605-
cut_rule = ENZYME_SPECIFICITY.get(enzyme)
606-
if cut_rule:
607-
parts.append(f"--cut {cut_rule}")
684+
# Enzyme cut rule. Single-enzyme path preserves the existing
685+
# "Unknown enzyme" warning; multi-enzyme path delegates to combiner.
686+
if len(enzymes) == 1:
687+
single = enzymes[0]
688+
cut_rule = ENZYME_SPECIFICITY.get(single)
689+
if cut_rule:
690+
parts.append(f"--cut {cut_rule}")
691+
else:
692+
self.add_warning(f"Unknown enzyme '{single}', no --cut rule generated")
608693
else:
609-
self.add_warning(f"Unknown enzyme '{enzyme}', no --cut rule generated")
694+
combined = self._combine_cut_rules(enzymes)
695+
if combined:
696+
parts.append(f"--cut {combined}")
697+
self.add_warning(f"Combined {len(enzymes)} cleavage agents {list(enzymes)} into --cut {combined}")
698+
else:
699+
self.add_warning(f"All enzymes {list(enzymes)} unknown, no --cut rule generated")
610700

611701
# Standard fixed modifications
612702
for mod in fixed_mods:
@@ -721,7 +811,7 @@ def _filemap_row(self, filename: str, fd: dict, label: str, label_type: str, des
721811
"DissociationMethod": design["dissociation_method"] if design else "",
722812
"Condition": design["condition"] if design else "",
723813
"BioReplicate": design["bioreplicate"] if design else "",
724-
"Enzyme": fd["enzyme"],
814+
"Enzyme": "+".join(fd["enzyme"]),
725815
"FixedModifications": ";".join(fd["fixed_mods"]),
726816
"VariableModifications": ";".join(fd["var_mods"]),
727817
"PrecursorMassTolerance": fd["precursor_tol"] or "",
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
source name characteristics[organism] assay name comment[label] comment[instrument] comment[cleavage agent details] comment[cleavage agent details] comment[modification parameters] comment[modification parameters] comment[precursor mass tolerance] comment[fragment mass tolerance] comment[data file]
2+
Sample 1 Homo sapiens run 1 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Lys-C;AC=MS:1001309 NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample1.raw
3+
Sample 2 Homo sapiens run 2 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 5 ppm 15 ppm sample2.raw
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
source name characteristics[organism] assay name comment[label] comment[instrument] comment[cleavage agent details] comment[cleavage agent details] comment[modification parameters] comment[modification parameters] comment[precursor mass tolerance] comment[fragment mass tolerance] comment[data file]
2+
Sample 1 Homo sapiens run 1 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Lys-C;AC=MS:1001309 NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample1.raw
3+
Sample 2 Homo sapiens run 2 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Lys-C;AC=MS:1001309 NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 5 ppm 15 ppm sample2.raw
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
source name characteristics[organism] assay name comment[label] comment[instrument] comment[cleavage agent details] comment[cleavage agent details] comment[modification parameters] comment[modification parameters] comment[precursor mass tolerance] comment[fragment mass tolerance] comment[data file]
2+
Sample 1 Homo sapiens run 1 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Lys-C;AC=MS:1001309 NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample1.raw
3+
Sample 2 Homo sapiens run 2 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Lys-C;AC=MS:1001309 NT=Trypsin/P;AC=MS:1001313 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 5 ppm 15 ppm sample2.raw
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
source name characteristics[organism] assay name comment[label] comment[instrument] comment[cleavage agent details] comment[cleavage agent details] comment[modification parameters] comment[modification parameters] comment[precursor mass tolerance] comment[fragment mass tolerance] comment[data file]
2+
Sample 1 Homo sapiens run 1 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin;AC=MS:1001251 NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample1.raw
3+
Sample 2 Homo sapiens run 2 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=Trypsin;AC=MS:1001251 NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 5 ppm 15 ppm sample2.raw
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
source name characteristics[organism] assay name comment[label] comment[instrument] comment[cleavage agent details] comment[cleavage agent details] comment[modification parameters] comment[modification parameters] comment[precursor mass tolerance] comment[fragment mass tolerance] comment[data file]
2+
Sample 1 Homo sapiens run 1 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=BogusProtease;AC=MS:9999999 NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 10 ppm 20 ppm sample1.raw
3+
Sample 2 Homo sapiens run 2 AC=MS:1002038;NT=label free sample AC=MS:1001742;NT=LTQ Orbitrap Velos NT=BogusProtease;AC=MS:9999999 NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 5 ppm 15 ppm sample2.raw

tests/test_convert_diann.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,3 +394,62 @@ class TestDiannScanRangeValidation:
394394
def test_inverted_scan_range_raises_error(self, diann_data_dir, on_tmpdir):
395395
with pytest.raises(ValueError, match="[Ii]nverted|[Mm]in.*greater.*max"):
396396
DiaNN().diann_convert(str(diann_data_dir / "scan_range_inverted.sdrf.tsv"))
397+
398+
399+
class TestDiannMultiEnzyme:
    """End-to-end checks for SDRFs declaring several cleavage agent columns."""

    @staticmethod
    def _assert_enzyme_column(on_tmpdir, expected):
        """Assert every row of the written design table reports ``expected``.

        The table must be non-empty; otherwise the equality check would pass
        vacuously and hide a converter that wrote no rows.
        """
        df = pd.read_csv(on_tmpdir / "diann_design.tsv", sep="\t")
        assert not df.empty
        assert (df["Enzyme"] == expected).all()

    def test_lys_c_trypsin_combined_cut_rule(self, diann_data_dir, on_tmpdir):
        """Lys-C + Trypsin: positives are unioned and the shared !*P is kept."""
        sdrf_file = str(diann_data_dir / "multi_enzyme_lys_c_trypsin.sdrf.tsv")
        converter = DiaNN()
        converter.diann_convert(sdrf_file)

        config = (on_tmpdir / "diann_config.cfg").read_text()
        assert "--cut K*,R*,!*P" in config

        self._assert_enzyme_column(on_tmpdir, "Lys-C+Trypsin")

    def test_lys_c_trypsin_p_drops_negation(self, diann_data_dir, on_tmpdir):
        """Trypsin/P has no !*P; intersection must drop the negation."""
        sdrf_file = str(diann_data_dir / "multi_enzyme_lys_c_trypsin_p.sdrf.tsv")
        converter = DiaNN()
        converter.diann_convert(sdrf_file)

        config = (on_tmpdir / "diann_config.cfg").read_text()
        assert "--cut " in config
        # The rule is the first whitespace-delimited token after "--cut ", so
        # this works whether the next option follows a space or a newline.
        cut_section = config.split("--cut ", 1)[1].split()[0]
        assert cut_section == "K*,R*"

        self._assert_enzyme_column(on_tmpdir, "Lys-C+Trypsin/P")

    def test_unknown_enzyme_warns_and_proceeds(self, diann_data_dir, on_tmpdir):
        """Unknown enzyme alongside a known one: warn, drop unknown, keep going."""
        sdrf_file = str(diann_data_dir / "multi_enzyme_unknown.sdrf.tsv")
        converter = DiaNN()
        converter.diann_convert(sdrf_file)

        config = (on_tmpdir / "diann_config.cfg").read_text()
        assert "--cut K*,R*,!*P" in config

        assert any("BogusProtease" in msg for msg in converter.warnings)

        self._assert_enzyme_column(on_tmpdir, "BogusProtease+Trypsin")

    def test_inconsistent_enzyme_sets_raises(self, diann_data_dir, on_tmpdir):
        """Different enzyme tuples per file must raise ValueError."""
        sdrf_file = str(diann_data_dir / "multi_enzyme_inconsistent.sdrf.tsv")
        converter = DiaNN()
        with pytest.raises(ValueError, match="Inconsistent enzyme sets"):
            converter.diann_convert(sdrf_file)

    def test_same_enzyme_twice_dedups(self, diann_data_dir, on_tmpdir):
        """Two columns declaring Trypsin must collapse to a single-enzyme run."""
        sdrf_file = str(diann_data_dir / "multi_enzyme_same.sdrf.tsv")
        converter = DiaNN()
        converter.diann_convert(sdrf_file)

        config = (on_tmpdir / "diann_config.cfg").read_text()
        assert "--cut K*,R*,!*P" in config

        # The multi-enzyme path records a "Combined N cleavage agents" warning;
        # a deduplicated single-enzyme run must not.
        assert not any("Combined" in msg and "cleavage agents" in msg for msg in converter.warnings)

        self._assert_enzyme_column(on_tmpdir, "Trypsin")

0 commit comments

Comments
 (0)