Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions q2_types/feature_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
TaxonomyFormat, TaxonomyDirectoryFormat, HeaderlessTSVTaxonomyFormat,
HeaderlessTSVTaxonomyDirectoryFormat, TSVTaxonomyFormat,
TSVTaxonomyDirectoryFormat, DNAFASTAFormat, DNASequencesDirectoryFormat,
LinkedDNAFASTAFormat, LinkedDNASequencesDirectoryFormat,
PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
AlignedDNASequencesDirectoryFormat, DifferentialFormat,
DifferentialDirectoryFormat, FASTAFormat, AlignedFASTAFormatMixin,
Expand All @@ -29,7 +30,8 @@
SequenceCharacteristicsDirectoryFormat,
SequenceCharacteristicsFormat)
from ._types import (
FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence,
FeatureData, Taxonomy, Sequence, LinkedSequence, PairedEndSequence,
AlignedSequence,
Differential, ProteinSequence, AlignedProteinSequence, RNASequence,
AlignedRNASequence, PairedEndRNASequence, BLAST6, SequenceCharacteristics)
from ._objects import (
Expand All @@ -41,9 +43,11 @@
'TaxonomyFormat', 'TaxonomyDirectoryFormat', 'HeaderlessTSVTaxonomyFormat',
'HeaderlessTSVTaxonomyDirectoryFormat', 'TSVTaxonomyFormat',
'TSVTaxonomyDirectoryFormat', 'DNAFASTAFormat', 'DifferentialFormat',
'DNASequencesDirectoryFormat', 'PairedDNASequencesDirectoryFormat',
'DNASequencesDirectoryFormat', 'LinkedDNAFASTAFormat',
'LinkedDNASequencesDirectoryFormat', 'PairedDNASequencesDirectoryFormat',
'AlignedDNAFASTAFormat', 'AlignedDNASequencesDirectoryFormat',
'FeatureData', 'Taxonomy', 'Sequence', 'PairedEndSequence',
'FeatureData', 'Taxonomy', 'Sequence', 'LinkedSequence',
'PairedEndSequence',
'AlignedSequence', 'NucleicAcidIterator', 'DNAIterator',
'PairedDNAIterator', 'FASTAFormat', 'AlignedDNAIterator', 'Differential',
'DifferentialDirectoryFormat', 'AlignedFASTAFormatMixin',
Expand Down
27 changes: 24 additions & 3 deletions q2_types/feature_data/_deferred_setup/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
TaxonomyFormat, TaxonomyDirectoryFormat, HeaderlessTSVTaxonomyFormat,
HeaderlessTSVTaxonomyDirectoryFormat, TSVTaxonomyFormat,
TSVTaxonomyDirectoryFormat, DNAFASTAFormat, DNASequencesDirectoryFormat,
LinkedDNAFASTAFormat, LinkedDNASequencesDirectoryFormat,
PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
AlignedDNASequencesDirectoryFormat, DifferentialFormat,
DifferentialDirectoryFormat, FASTAFormat,
Expand All @@ -30,7 +31,8 @@
MixedCaseAlignedRNASequencesDirectoryFormat,
SequenceCharacteristicsDirectoryFormat,
SequenceCharacteristicsFormat,
FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence,
FeatureData, Taxonomy, Sequence, LinkedSequence, PairedEndSequence,
AlignedSequence,
Differential, ProteinSequence, AlignedProteinSequence, RNASequence,
AlignedRNASequence, PairedEndRNASequence, BLAST6, SequenceCharacteristics)

Expand All @@ -41,7 +43,8 @@
TSVTaxonomyFormat, TSVTaxonomyDirectoryFormat,
HeaderlessTSVTaxonomyFormat, HeaderlessTSVTaxonomyDirectoryFormat,
TaxonomyFormat, TaxonomyDirectoryFormat, FASTAFormat, DNAFASTAFormat,
DNASequencesDirectoryFormat, PairedDNASequencesDirectoryFormat,
LinkedDNAFASTAFormat, DNASequencesDirectoryFormat,
LinkedDNASequencesDirectoryFormat, PairedDNASequencesDirectoryFormat,
AlignedDNAFASTAFormat, AlignedDNASequencesDirectoryFormat,
DifferentialFormat, DifferentialDirectoryFormat, ProteinFASTAFormat,
AlignedProteinFASTAFormat, MixedCaseProteinFASTAFormat,
Expand All @@ -61,6 +64,7 @@
)

plugin.register_semantic_types(FeatureData, Taxonomy, Sequence,
LinkedSequence,
PairedEndSequence, AlignedSequence,
Differential, ProteinSequence,
AlignedProteinSequence, RNASequence,
Expand All @@ -84,6 +88,17 @@
"sequence). Exactly one sequence is associated with each "
"feature identifier."))

# Register FeatureData[LinkedSequence] as an artifact class whose on-disk
# representation is LinkedDNASequencesDirectoryFormat. The description string
# documents the convention used by the data: a record holds either a merged
# read pair or an unmerged pair, the latter written with one space between
# the forward and reverse reads.
plugin.register_artifact_class(
FeatureData[LinkedSequence],
directory_format=LinkedDNASequencesDirectoryFormat,
description=(
"Unaligned DNA sequences associated with a set of feature "
"identifiers. Each sequence represents either a merged pair or an "
"unmerged pair of reads. Unmerged pairs have a single space "
"separating the forward and reverse reads."
)
)

plugin.register_artifact_class(
FeatureData[RNASequence],
directory_format=RNASequencesDirectoryFormat,
Expand All @@ -93,7 +108,13 @@

# Register FeatureData[PairedEndSequence] as an artifact class stored as
# PairedDNASequencesDirectoryFormat. `directory_format` is passed exactly
# once; the call carries a description so this artifact class is documented
# in the same way as the other FeatureData registrations in this module.
plugin.register_artifact_class(
    FeatureData[PairedEndSequence],
    directory_format=PairedDNASequencesDirectoryFormat,
    description=(
        "Unaligned DNA sequences associated with a set of feature "
        "identifiers. One forward sequence and one reverse sequence is "
        "associated with each feature identifier."
    )
)

plugin.register_artifact_class(
FeatureData[PairedEndRNASequence],
Expand Down
15 changes: 14 additions & 1 deletion q2_types/feature_data/_deferred_setup/_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from .. import (
TaxonomyFormat, HeaderlessTSVTaxonomyFormat, TSVTaxonomyFormat,
DNAFASTAFormat, PairedDNASequencesDirectoryFormat,
DNAFASTAFormat, LinkedDNAFASTAFormat, PairedDNASequencesDirectoryFormat,
AlignedDNAFASTAFormat, DifferentialFormat, ProteinFASTAFormat,
AlignedProteinFASTAFormat, RNAFASTAFormat,
AlignedRNAFASTAFormat, PairedRNASequencesDirectoryFormat,
Expand Down Expand Up @@ -316,6 +316,19 @@ def _10(data: DNAIterator) -> DNAFASTAFormat:
return ff


@plugin.register_transformer
def _231(ff: LinkedDNAFASTAFormat) -> DNAIterator:
    """Read a linked-DNA FASTA file into a DNAIterator.

    Records are parsed with the generic ``skbio.Sequence`` constructor and
    the resulting generator is wrapped in a ``DNAIterator``.
    """
    # NOTE(review): the generic skbio.Sequence constructor is presumably
    # chosen because linked records may contain a space, which a strict DNA
    # alphabet would reject -- confirm against read_from_fasta.
    generator = read_from_fasta(str(ff), skbio.Sequence)
    return DNAIterator(generator)


@plugin.register_transformer
def _232(data: DNAIterator) -> LinkedDNAFASTAFormat:
    """Write every sequence yielded by ``data`` to a new linked-DNA FASTA file.

    A fresh ``LinkedDNAFASTAFormat`` is created, the records are written to
    its path in FASTA format, and the format object is returned.
    """
    linked_fasta = LinkedDNAFASTAFormat()
    records = iter(data)
    skbio.io.write(records, format='fasta', into=str(linked_fasta))
    return linked_fasta


@plugin.register_transformer
def _11(df: PairedDNASequencesDirectoryFormat) -> PairedDNAIterator:
left = df.left_dna_sequences.view(DNAIterator)
Expand Down
31 changes: 31 additions & 0 deletions q2_types/feature_data/_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,31 @@ def __init__(self, *args, **kwargs):
self.alphabet = "ACGTRYKMSWBDHVN"


class LinkedDNAFASTAFormat(FASTAFormat):
    '''
    Linked sequences are paired end sequences that may contain a single
    space between two unmerged read directions. A space is not a valid
    FASTA character, so this format is technically not FASTA (fastalmost).
    '''
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The trailing space is intentional: it is the separator written
        # between the two reads of an unmerged pair.
        self.alphabet = "ACGTRYKMSWBDHVN "

    def _validate_(self, level='min'):
        '''
        Ensure at most one space in each sequence record.

        The parent class validation runs first and receives ``level``.
        This check then reads the whole file regardless of ``level`` and
        counts spaces per record: the count restarts at every '>' header
        line, spaces on header lines are not counted, and a record that
        is wrapped over several lines is limited to one space in total.

        Raises ValidationError naming the line on which a record's second
        space was found.
        '''
        super()._validate_(level)

        spaces_in_record = 0
        with open(self.path, 'r') as fh:
            for line_number, line in enumerate(fh, start=1):
                if line.startswith('>'):
                    # A header starts a new record; reset the running count.
                    spaces_in_record = 0
                    continue

                spaces_in_record += line.count(' ')
                if spaces_in_record > 1:
                    # Drop the line terminator so the message stays on a
                    # single line.
                    sequence = line.rstrip('\r\n')
                    raise ValidationError(
                        'Expected at most one space in each sequence '
                        'record. '
                        f'Invalid sequence on line {line_number}: '
                        f'{sequence}.'
                    )


class AlignedFASTAFormatMixin:
def _turn_into_alignment(self):
self.aligned = True
Expand All @@ -296,6 +321,12 @@ def _validate_line_lengths(
# Directory format named 'DNASequencesDirectoryFormat' whose member file
# 'dna-sequences.fasta' is handled by DNAFASTAFormat.
DNASequencesDirectoryFormat = model.SingleFileDirectoryFormat(
'DNASequencesDirectoryFormat', 'dna-sequences.fasta', DNAFASTAFormat)

# Directory format named 'LinkedDNASequencesDirectoryFormat' whose member
# file 'linked-dna-sequences.fasta' is handled by LinkedDNAFASTAFormat.
LinkedDNASequencesDirectoryFormat = model.SingleFileDirectoryFormat(
'LinkedDNASequencesDirectoryFormat',
'linked-dna-sequences.fasta',
LinkedDNAFASTAFormat
)


class MixedCaseDNAFASTAFormat(DNAFASTAFormat):
def __init__(self, *args, **kwargs):
Expand Down
3 changes: 3 additions & 0 deletions q2_types/feature_data/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@

# Each semantic type below is declared as a variant of the 'type' field of
# FeatureData, so it is used as FeatureData[<type>].
Sequence = SemanticType('Sequence', variant_of=FeatureData.field['type'])

# NOTE(review): presumably the type for records holding a merged read pair
# or an unmerged pair joined by a single space -- confirm against the
# artifact class registration.
LinkedSequence = SemanticType('LinkedSequence',
variant_of=FeatureData.field['type'])

RNASequence = SemanticType('RNASequence', variant_of=FeatureData.field['type'])

PairedEndSequence = SemanticType('PairedEndSequence',
Expand Down
64 changes: 64 additions & 0 deletions q2_types/feature_data/tests/test_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
TaxonomyFormat, TaxonomyDirectoryFormat, HeaderlessTSVTaxonomyFormat,
HeaderlessTSVTaxonomyDirectoryFormat, TSVTaxonomyFormat,
TSVTaxonomyDirectoryFormat, DNAFASTAFormat, DNASequencesDirectoryFormat,
LinkedDNAFASTAFormat, LinkedDNASequencesDirectoryFormat,
PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
AlignedDNASequencesDirectoryFormat, DifferentialDirectoryFormat,
ProteinFASTAFormat, AlignedProteinFASTAFormat, FASTAFormat,
Expand Down Expand Up @@ -271,6 +272,69 @@ def test_dna_sequences_directory_format(self):

format.validate()

def test_linked_dna_fasta_format_validate_positive(self):
    # One record without a space and one record with exactly one space
    # must both be accepted.
    fasta_path = os.path.join(
        self.temp_dir.name, 'linked-dna-sequences.fasta'
    )
    records = ['>id1\n', 'ACGTACGT\n', '>id2\n', 'ACGT ACGT\n']
    with open(fasta_path, 'w') as fh:
        fh.writelines(records)

    LinkedDNAFASTAFormat(fasta_path, mode='r').validate()

def test_linked_dna_fasta_format_invalid_characters(self):
    # '+' is not part of the format's alphabet, so validation must fail
    # and name the offending character.
    bad_path = os.path.join(self.temp_dir.name, 'linked-invalid.fasta')
    with open(bad_path, 'w') as fh:
        fh.writelines(['>id1\n', 'ACGT+ACGT\n'])

    linked_fasta = LinkedDNAFASTAFormat(bad_path, mode='r')
    expected_message = "Invalid character '\\+'"
    with self.assertRaisesRegex(ValidationError, expected_message):
        linked_fasta.validate()

def test_linked_dna_sequences_directory_format(self):
    # Place a valid linked FASTA file in the temp directory under the
    # name 'linked-dna-sequences.fasta' and validate the directory.
    directory = self.temp_dir.name
    member_path = os.path.join(directory, 'linked-dna-sequences.fasta')
    with open(member_path, 'w') as fh:
        fh.writelines(['>id1\n', 'ACGT ACGT\n'])

    LinkedDNASequencesDirectoryFormat(directory, mode='r').validate()

def test_linked_dna_fasta_format_spaces_in_header(self):
    # Several spaces on the '>' header line must not make validation
    # fail; the sequence line itself holds a single space.
    fasta_path = os.path.join(
        self.temp_dir.name, 'linked-dna-sequences.fasta'
    )
    with open(fasta_path, 'w') as fh:
        fh.writelines(['>id1 with many spaces\n', 'ACGT ACGT\n'])

    LinkedDNAFASTAFormat(fasta_path, mode='r').validate()

def test_linked_dna_sequences_too_many_spaces_in_sequence(self):
    # The sequence line must hold MORE than one space to trigger the
    # error; a single space is valid. Two consecutive spaces are written
    # on purpose, and the expected message repeats them.
    filepath = os.path.join(
        self.temp_dir.name, 'linked-dna-sequences.fasta'
    )
    with open(filepath, 'w') as fh:
        fh.write('>id1 with many spaces\n')
        fh.write('ACGT  ACGT\n')

    format = LinkedDNAFASTAFormat(filepath, mode='r')
    # The regex pins the message prefix, the reported line number (2) and
    # the offending sequence text.
    with self.assertRaisesRegex(
        ValidationError, "Expected at most one space.*2.*ACGT  ACGT"
    ):
        format.validate()

def test_dna_fasta_format_duplicate_ids(self):
filepath = self.get_data_path('dna-sequences-duplicate-id.fasta')
format = DNAFASTAFormat(filepath, mode='r')
Expand Down
35 changes: 34 additions & 1 deletion q2_types/feature_data/tests/test_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from q2_types.feature_table import BIOMV210Format
from q2_types.feature_data import (
TaxonomyFormat, HeaderlessTSVTaxonomyFormat, TSVTaxonomyFormat,
DNAFASTAFormat, DNAIterator, PairedDNAIterator,
DNAFASTAFormat, LinkedDNAFASTAFormat, DNAIterator, PairedDNAIterator,
ProteinIterator, AlignedProteinIterator,
PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
DifferentialFormat, AlignedDNAIterator, ProteinFASTAFormat,
Expand Down Expand Up @@ -563,6 +563,39 @@ def test_dna_iterator_to_dna_fasta_format(self):
for act, exp in zip(obs, input):
self.assertEqual(act, exp)

def test_linked_dna_fasta_format_to_dna_iterator(self):
    # Reading a linked FASTA file must keep ids, keep the embedded space,
    # and yield plain skbio.Sequence objects.
    fasta_path = os.path.join(self.temp_dir.name, 'linked-dna.fasta')
    with open(fasta_path, 'w') as fh:
        fh.writelines(['>id1\n', 'ACGT ACGT\n', '>id2\n', 'ACGT\n'])

    to_iterator = self.get_transformer(LinkedDNAFASTAFormat, DNAIterator)
    linked_fasta = LinkedDNAFASTAFormat(fasta_path, mode='r')
    obs = list(to_iterator(linked_fasta))

    obs_ids = [seq.metadata['id'] for seq in obs]
    self.assertEqual(obs_ids, ['id1', 'id2'])
    obs_text = [str(seq) for seq in obs]
    self.assertEqual(obs_text, ['ACGT ACGT', 'ACGT'])
    # Exact type comparison: subclasses of skbio.Sequence do not count.
    self.assertTrue(all(type(seq) is skbio.Sequence for seq in obs))

def test_dna_iterator_to_linked_dna_fasta_format(self):
    # Writing a DNAIterator must produce a LinkedDNAFASTAFormat whose
    # file, read back with skbio, holds the same ids and sequences.
    to_linked_fasta = self.get_transformer(
        DNAIterator, LinkedDNAFASTAFormat
    )
    sequences = [
        skbio.Sequence('ACGT ACGT', metadata={'id': 'id1'}),
        skbio.Sequence('ACGT', metadata={'id': 'id2'}),
    ]

    obs = to_linked_fasta(DNAIterator(iter(sequences)))
    self.assertIsInstance(obs, LinkedDNAFASTAFormat)

    reader = skbio.read(
        str(obs), format='fasta', constructor=skbio.Sequence
    )
    reread = list(reader)
    reread_ids = [seq.metadata['id'] for seq in reread]
    self.assertEqual(reread_ids, ['id1', 'id2'])
    reread_text = [str(seq) for seq in reread]
    self.assertEqual(reread_text, ['ACGT ACGT', 'ACGT'])

def test_aln_dna_fasta_format_to_aln_dna_iterator(self):
filename = 'aligned-dna-sequences.fasta'
input, obs = self.transform_format(AlignedDNAFASTAFormat,
Expand Down
12 changes: 11 additions & 1 deletion q2_types/feature_data/tests/test_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@


from q2_types.feature_data import (
FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence,
FeatureData, Taxonomy, Sequence, LinkedSequence, PairedEndSequence,
AlignedSequence,
Differential, TSVTaxonomyDirectoryFormat, DNASequencesDirectoryFormat,
LinkedDNASequencesDirectoryFormat,
DifferentialDirectoryFormat, PairedDNASequencesDirectoryFormat,
AlignedDNASequencesDirectoryFormat, ProteinSequencesDirectoryFormat,
AlignedProteinSequencesDirectoryFormat, ProteinSequence,
Expand All @@ -36,6 +38,9 @@ def test_taxonomy_semantic_type_registration(self):
def test_sequence_semantic_type_registration(self):
    # Sequence must be a registered semantic type.
    semantic_type = Sequence
    self.assertRegisteredSemanticType(semantic_type)

def test_linked_sequence_semantic_type_registration(self):
    # LinkedSequence must be a registered semantic type.
    semantic_type = LinkedSequence
    self.assertRegisteredSemanticType(semantic_type)

def test_paired_end_sequence_semantic_type_registration(self):
    # PairedEndSequence must be a registered semantic type.
    semantic_type = PairedEndSequence
    self.assertRegisteredSemanticType(semantic_type)

Expand Down Expand Up @@ -63,6 +68,11 @@ def test_sequence_semantic_type_to_format_registration(self):
self.assertSemanticTypeRegisteredToFormat(
FeatureData[Sequence], DNASequencesDirectoryFormat)

def test_linked_sequence_semantic_type_to_format_registration(self):
    # FeatureData[LinkedSequence] must be registered to the linked DNA
    # sequences directory format.
    artifact_type = FeatureData[LinkedSequence]
    expected_format = LinkedDNASequencesDirectoryFormat
    self.assertSemanticTypeRegisteredToFormat(
        artifact_type, expected_format
    )

def test_paired_end_sequence_semantic_type_to_format_registration(self):
self.assertSemanticTypeRegisteredToFormat(
FeatureData[PairedEndSequence],
Expand Down
Loading