Skip to content

Commit aecaa84

Browse files
authored
IMP: Adds warnings when importing potentially problematic TSVTaxonomyFormat files. (#379)
1 parent 3b335ce commit aecaa84

4 files changed

Lines changed: 69 additions & 2 deletions

File tree

q2_types/feature_data/_deferred_setup/_validators.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,17 @@
66
# The full license is in the file LICENSE, distributed with this software.
77
# ----------------------------------------------------------------------------
88

9+
import warnings
910
import pandas as pd
1011

1112
from qiime2.plugin import Properties, ValidationError
1213

13-
from .. import FeatureData, SequenceCharacteristics
14+
from .. import FeatureData, SequenceCharacteristics, Taxonomy
1415

1516
from ...plugin_setup import plugin
1617

18+
from qiime2.core.exceptions import RachisWarning
19+
1720

1821
@plugin.register_validator(FeatureData[SequenceCharacteristics %
1922
Properties("length")])
@@ -37,3 +40,34 @@ def validate_seq_char_len(data: pd.DataFrame, level):
3740
if not (data['length'] > 0).all():
3841
raise ValidationError('Column "length" cannot contain negative '
3942
'values.')
43+
44+
45+
@plugin.register_validator(FeatureData[Taxonomy])
def _check_single_taxon(data: pd.DataFrame, level):
    '''
    Warns when no taxon string in the taxonomy is deeper than one rank.

    Depth is judged by the number of ';' characters in each entry of the
    'Taxon' column. If the largest count is zero (which is also the case
    when the column has no entries) a RachisWarning is issued. The `level`
    argument is accepted but not used.
    '''
    # default=0 covers an empty 'Taxon' column, for which max() would
    # otherwise raise.
    deepest = max(
        (taxon.count(';') for taxon in data['Taxon']),
        default=0,
    )
    if deepest != 0:
        return

    warnings.warn(
        'Importing taxonomy with taxonomic depth of one.',
        RachisWarning
    )
60+
61+
62+
@plugin.register_validator(FeatureData[Taxonomy])
def _check_trailing_semicolon(data: pd.DataFrame, level):
    '''
    Raises a warning if any taxon string in the taxonomy has a trailing
    semicolon.

    Each entry of the 'Taxon' column is checked after stripping trailing
    whitespace. The warning text does not identify a row, so a single
    RachisWarning is issued per call no matter how many entries are
    affected; repeating it for every offending row adds no information.
    The `level` argument is accepted but not used.
    '''
    # any() stops at the first offending entry, so the remaining rows are
    # not scanned once the outcome is known.
    has_trailing_semicolon = any(
        taxon.rstrip().endswith(';') for taxon in data['Taxon']
    )

    if has_trailing_semicolon:
        warnings.warn(
            'Importing taxonomy with a trailing semicolon.',
            RachisWarning
        )
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Feature ID Taxon
2+
seq1; k__Bacteria
3+
seq2; k__Archaea
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Feature ID Taxon Confidence
2+
seq1 k__Bacteria 0.3
3+
seq2; k__Bacteria; p__Proteobacteria 0.5
4+
seq3 k__Bacteria; p__Proteobacteria; 0.5

q2_types/feature_data/tests/test_formats.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535
)
3636
from qiime2.plugin.testing import TestPluginBase
3737
from qiime2.plugin import ValidationError
38+
from qiime2.core.exceptions import RachisWarning
39+
from qiime2.sdk.result import Artifact
3840

3941

4042
class TestTaxonomyFormats(TestPluginBase):
@@ -48,7 +50,6 @@ def test_taxonomy_format_validate_positive(self):
4850

4951
for filepath in filepaths:
5052
format = TaxonomyFormat(filepath, mode='r')
51-
5253
format.validate()
5354

5455
def test_taxonomy_format_validate_negative(self):
@@ -116,6 +117,31 @@ def test_headerless_tsv_taxonomy_directory_format(self):
116117

117118
format.validate()
118119

120+
def test_tsv_taxonomy_format_warnings(self):
    '''
    Checks that importing the depth-one fixture and the trailing-semicolon
    fixture each emits the corresponding RachisWarning.
    '''
    # (fixture file under 'taxonomy', regex the warning message must match)
    expected_warnings = (
        ('one-depth.tsv',
         'Importing taxonomy with taxonomic depth of one.'),
        ('trailing-semicolon.tsv',
         'Importing taxonomy with a trailing semicolon.'),
    )

    for filename, message in expected_warnings:
        path = self.get_data_path(os.path.join('taxonomy', filename))
        fmt = TSVTaxonomyFormat(path, mode='r')

        with self.assertWarnsRegex(RachisWarning, message):
            Artifact.import_data('FeatureData[Taxonomy]', fmt)
144+
119145
def test_tsv_taxonomy_format_validate_positive(self):
120146
filenames = ['2-column.tsv', '3-column.tsv', 'valid-but-messy.tsv',
121147
'many-rows.tsv']

0 commit comments

Comments
 (0)