Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions q2_types/feature_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
TaxonomyFormat, TaxonomyDirectoryFormat, HeaderlessTSVTaxonomyFormat,
HeaderlessTSVTaxonomyDirectoryFormat, TSVTaxonomyFormat,
TSVTaxonomyDirectoryFormat, DNAFASTAFormat, DNASequencesDirectoryFormat,
LinkedDNAFASTAFormat, LinkedDNASequencesDirectoryFormat,
PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
AlignedDNASequencesDirectoryFormat, DifferentialFormat,
DifferentialDirectoryFormat, FASTAFormat, AlignedFASTAFormatMixin,
Expand All @@ -29,7 +30,8 @@
SequenceCharacteristicsDirectoryFormat,
SequenceCharacteristicsFormat)
from ._types import (
FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence,
FeatureData, Taxonomy, Sequence, LinkedSequence, PairedEndSequence,
AlignedSequence,
Differential, ProteinSequence, AlignedProteinSequence, RNASequence,
AlignedRNASequence, PairedEndRNASequence, BLAST6, SequenceCharacteristics)
from ._objects import (
Expand All @@ -41,9 +43,11 @@
'TaxonomyFormat', 'TaxonomyDirectoryFormat', 'HeaderlessTSVTaxonomyFormat',
'HeaderlessTSVTaxonomyDirectoryFormat', 'TSVTaxonomyFormat',
'TSVTaxonomyDirectoryFormat', 'DNAFASTAFormat', 'DifferentialFormat',
'DNASequencesDirectoryFormat', 'PairedDNASequencesDirectoryFormat',
'DNASequencesDirectoryFormat', 'LinkedDNAFASTAFormat',
'LinkedDNASequencesDirectoryFormat', 'PairedDNASequencesDirectoryFormat',
'AlignedDNAFASTAFormat', 'AlignedDNASequencesDirectoryFormat',
'FeatureData', 'Taxonomy', 'Sequence', 'PairedEndSequence',
'FeatureData', 'Taxonomy', 'Sequence', 'LinkedSequence',
'PairedEndSequence',
'AlignedSequence', 'NucleicAcidIterator', 'DNAIterator',
'PairedDNAIterator', 'FASTAFormat', 'AlignedDNAIterator', 'Differential',
'DifferentialDirectoryFormat', 'AlignedFASTAFormatMixin',
Expand Down
27 changes: 24 additions & 3 deletions q2_types/feature_data/_deferred_setup/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
TaxonomyFormat, TaxonomyDirectoryFormat, HeaderlessTSVTaxonomyFormat,
HeaderlessTSVTaxonomyDirectoryFormat, TSVTaxonomyFormat,
TSVTaxonomyDirectoryFormat, DNAFASTAFormat, DNASequencesDirectoryFormat,
LinkedDNAFASTAFormat, LinkedDNASequencesDirectoryFormat,
PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
AlignedDNASequencesDirectoryFormat, DifferentialFormat,
DifferentialDirectoryFormat, FASTAFormat,
Expand All @@ -30,7 +31,8 @@
MixedCaseAlignedRNASequencesDirectoryFormat,
SequenceCharacteristicsDirectoryFormat,
SequenceCharacteristicsFormat,
FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence,
FeatureData, Taxonomy, Sequence, LinkedSequence, PairedEndSequence,
AlignedSequence,
Differential, ProteinSequence, AlignedProteinSequence, RNASequence,
AlignedRNASequence, PairedEndRNASequence, BLAST6, SequenceCharacteristics)

Expand All @@ -41,7 +43,8 @@
TSVTaxonomyFormat, TSVTaxonomyDirectoryFormat,
HeaderlessTSVTaxonomyFormat, HeaderlessTSVTaxonomyDirectoryFormat,
TaxonomyFormat, TaxonomyDirectoryFormat, FASTAFormat, DNAFASTAFormat,
DNASequencesDirectoryFormat, PairedDNASequencesDirectoryFormat,
LinkedDNAFASTAFormat, DNASequencesDirectoryFormat,
LinkedDNASequencesDirectoryFormat, PairedDNASequencesDirectoryFormat,
AlignedDNAFASTAFormat, AlignedDNASequencesDirectoryFormat,
DifferentialFormat, DifferentialDirectoryFormat, ProteinFASTAFormat,
AlignedProteinFASTAFormat, MixedCaseProteinFASTAFormat,
Expand All @@ -61,6 +64,7 @@
)

plugin.register_semantic_types(FeatureData, Taxonomy, Sequence,
LinkedSequence,
PairedEndSequence, AlignedSequence,
Differential, ProteinSequence,
AlignedProteinSequence, RNASequence,
Expand All @@ -84,6 +88,17 @@
"sequence). Exactly one sequence is associated with each "
"feature identifier."))

# Register FeatureData[LinkedSequence] as an artifact class whose on-disk
# representation is LinkedDNASequencesDirectoryFormat.
plugin.register_artifact_class(
FeatureData[LinkedSequence],
directory_format=LinkedDNASequencesDirectoryFormat,
description=(
"Unaligned DNA sequences associated with a set of feature "
"identifiers. Each sequence represents either a merged pair or an "
"unmerged pair of reads. Unmerged pairs have a single space "
"separating the forward and reverse reads."
)
)

plugin.register_artifact_class(
FeatureData[RNASequence],
directory_format=RNASequencesDirectoryFormat,
Expand All @@ -93,7 +108,13 @@

plugin.register_artifact_class(
FeatureData[PairedEndSequence],
directory_format=PairedDNASequencesDirectoryFormat)
directory_format=PairedDNASequencesDirectoryFormat,
description=(
"Unaligned DNA sequences associated with a set of feature "
"identifiers. One forward sequence and one reverse sequence is "
"associated with each feature identifier."
)
)

plugin.register_artifact_class(
FeatureData[PairedEndRNASequence],
Expand Down
53 changes: 52 additions & 1 deletion q2_types/feature_data/_deferred_setup/_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from .. import (
TaxonomyFormat, HeaderlessTSVTaxonomyFormat, TSVTaxonomyFormat,
DNAFASTAFormat, PairedDNASequencesDirectoryFormat,
DNAFASTAFormat, LinkedDNAFASTAFormat, PairedDNASequencesDirectoryFormat,
AlignedDNAFASTAFormat, DifferentialFormat, ProteinFASTAFormat,
AlignedProteinFASTAFormat, RNAFASTAFormat,
AlignedRNAFASTAFormat, PairedRNASequencesDirectoryFormat,
Expand Down Expand Up @@ -302,6 +302,16 @@ def _series_to_fasta_format(ff, data, sequence_type="DNA", lowercase=False):
skbio.io.write(sequence, format='fasta', into=f)


def _read_linked_from_fasta(path):
    '''
    Read the FASTA file at ``path`` with scikit-bio and return the reader's
    result, which callers iterate to obtain ``skbio.Sequence`` records.

    The generic ``skbio.Sequence`` constructor is requested, and the reader
    is called with ``lowercase=False`` and ``keep_spaces=True`` so that the
    space linking two unmerged reads is kept in the sequence.
    '''
    reader_options = {
        'format': 'fasta',
        'constructor': skbio.Sequence,
        'lowercase': False,
        'keep_spaces': True,
    }
    return skbio.read(path, **reader_options)


# DNA Transformers
@plugin.register_transformer
def _9(ff: DNAFASTAFormat) -> DNAIterator:
Expand All @@ -316,6 +326,47 @@ def _10(data: DNAIterator) -> DNAFASTAFormat:
return ff


@plugin.register_transformer
def _231(ff: LinkedDNAFASTAFormat) -> DNAIterator:
    '''
    Transform a linked-DNA FASTA file into a ``DNAIterator`` wrapping the
    records produced by ``_read_linked_from_fasta``.
    '''
    return DNAIterator(_read_linked_from_fasta(str(ff)))


@plugin.register_transformer
def _232(data: DNAIterator) -> LinkedDNAFASTAFormat:
    '''
    Transform a ``DNAIterator`` into a new ``LinkedDNAFASTAFormat`` file by
    writing its records out in FASTA format with scikit-bio.
    '''
    linked_fasta = LinkedDNAFASTAFormat()
    records = iter(data)
    skbio.io.write(records, format='fasta', into=str(linked_fasta))
    return linked_fasta


@plugin.register_transformer
def _233(ff: LinkedDNAFASTAFormat) -> pd.Series:
    '''
    Transform a linked-DNA FASTA file into a ``pd.Series`` of sequence
    records indexed by sequence ID.

    Raises
    ------
    ValueError
        If the same sequence ID appears more than once in the file.
    '''
    records_by_id = {}
    for record in _read_linked_from_fasta(str(ff)):
        id_ = record.metadata['id']
        if id_ not in records_by_id:
            records_by_id[id_] = record
            continue
        # Reaching this point means the ID was already seen.
        raise ValueError(
            "FASTA format sequence IDs must be unique. The following ID "
            f"was found more than once: {id_}."
        )

    return pd.Series(records_by_id)


@plugin.register_transformer
def _234(data: pd.Series) -> LinkedDNAFASTAFormat:
    '''
    Transform a ``pd.Series`` (index: sequence IDs, values: sequences) into
    a new ``LinkedDNAFASTAFormat`` file.

    Each value is converted with ``str()`` and wrapped in a generic
    ``skbio.Sequence`` carrying the index label as its ``id`` metadata.
    '''
    linked_fasta = LinkedDNAFASTAFormat()
    with linked_fasta.open() as handle:
        # NOTE: one write call per record is intentional. Per the review
        # discussion on this change, the open handle buffers output and
        # flushes it in blocks, so this should not be slower than a single
        # bulk write.
        for id_, seq in data.items():
            record = skbio.Sequence(
                str(seq), metadata={'id': id_}, lowercase=False
            )
            skbio.io.write(record, format='fasta', into=handle)

    return linked_fasta


@plugin.register_transformer
def _11(df: PairedDNASequencesDirectoryFormat) -> PairedDNAIterator:
left = df.left_dna_sequences.view(DNAIterator)
Expand Down
17 changes: 17 additions & 0 deletions q2_types/feature_data/_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,17 @@ def __init__(self, *args, **kwargs):
self.alphabet = "ACGTRYKMSWBDHVN"


class LinkedDNAFASTAFormat(DNAFASTAFormat):
    '''
    DNA FASTA variant for linked (paired-end) sequences.

    A record may contain a single space separating the two unmerged read
    directions. Because a space is not a valid FASTA character, files in
    this format are technically not FASTA (fastalmost).
    '''
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Extend the inherited alphabet with the space character.
        self.alphabet = self.alphabet + " "


class AlignedFASTAFormatMixin:
def _turn_into_alignment(self):
self.aligned = True
Expand All @@ -296,6 +307,12 @@ def _validate_line_lengths(
DNASequencesDirectoryFormat = model.SingleFileDirectoryFormat(
'DNASequencesDirectoryFormat', 'dna-sequences.fasta', DNAFASTAFormat)

# Single-file directory format: one LinkedDNAFASTAFormat file named
# 'linked-dna-sequences.fasta'.
LinkedDNASequencesDirectoryFormat = model.SingleFileDirectoryFormat(
'LinkedDNASequencesDirectoryFormat',
'linked-dna-sequences.fasta',
LinkedDNAFASTAFormat
)


class MixedCaseDNAFASTAFormat(DNAFASTAFormat):
def __init__(self, *args, **kwargs):
Expand Down
3 changes: 3 additions & 0 deletions q2_types/feature_data/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@

Sequence = SemanticType('Sequence', variant_of=FeatureData.field['type'])

# Semantic type for linked sequences; usable as FeatureData[LinkedSequence]
# because it is declared as a variant of FeatureData's 'type' field.
LinkedSequence = SemanticType('LinkedSequence',
variant_of=FeatureData.field['type'])

RNASequence = SemanticType('RNASequence', variant_of=FeatureData.field['type'])

PairedEndSequence = SemanticType('PairedEndSequence',
Expand Down
39 changes: 39 additions & 0 deletions q2_types/feature_data/tests/test_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
TaxonomyFormat, TaxonomyDirectoryFormat, HeaderlessTSVTaxonomyFormat,
HeaderlessTSVTaxonomyDirectoryFormat, TSVTaxonomyFormat,
TSVTaxonomyDirectoryFormat, DNAFASTAFormat, DNASequencesDirectoryFormat,
LinkedDNAFASTAFormat, LinkedDNASequencesDirectoryFormat,
PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
AlignedDNASequencesDirectoryFormat, DifferentialDirectoryFormat,
ProteinFASTAFormat, AlignedProteinFASTAFormat, FASTAFormat,
Expand Down Expand Up @@ -271,6 +272,44 @@ def test_dna_sequences_directory_format(self):

format.validate()

def test_linked_dna_fasta_format_validate_positive(self):
    '''
    A file holding one record without a space and one record with a
    single space passes LinkedDNAFASTAFormat validation.
    '''
    fasta_path = os.path.join(
        self.temp_dir.name, 'linked-dna-sequences.fasta'
    )
    lines = ['>id1\n', 'ACGTACGT\n', '>id2\n', 'ACGT ACGT\n']
    with open(fasta_path, 'w') as out:
        for line in lines:
            out.write(line)

    LinkedDNAFASTAFormat(fasta_path, mode='r').validate()

def test_linked_dna_fasta_format_invalid_characters(self):
    '''
    A '+' inside a sequence is rejected with a ValidationError naming the
    offending character.
    '''
    fasta_path = os.path.join(self.temp_dir.name, 'linked-invalid.fasta')
    with open(fasta_path, 'w') as out:
        for line in ('>id1\n', 'ACGT+ACGT\n'):
            out.write(line)

    linked_fasta = LinkedDNAFASTAFormat(fasta_path, mode='r')
    expected_message = "Invalid character '\\+'"
    with self.assertRaisesRegex(ValidationError, expected_message):
        linked_fasta.validate()

def test_linked_dna_sequences_directory_format(self):
    '''
    A directory containing 'linked-dna-sequences.fasta' with a
    space-linked record validates as LinkedDNASequencesDirectoryFormat.
    '''
    directory = self.temp_dir.name
    fasta_path = os.path.join(directory, 'linked-dna-sequences.fasta')
    with open(fasta_path, 'w') as out:
        for line in ('>id1\n', 'ACGT ACGT\n'):
            out.write(line)

    linked_dir = LinkedDNASequencesDirectoryFormat(directory, mode='r')
    linked_dir.validate()

def test_dna_fasta_format_duplicate_ids(self):
filepath = self.get_data_path('dna-sequences-duplicate-id.fasta')
format = DNAFASTAFormat(filepath, mode='r')
Expand Down
89 changes: 88 additions & 1 deletion q2_types/feature_data/tests/test_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from q2_types.feature_table import BIOMV210Format
from q2_types.feature_data import (
TaxonomyFormat, HeaderlessTSVTaxonomyFormat, TSVTaxonomyFormat,
DNAFASTAFormat, DNAIterator, PairedDNAIterator,
DNAFASTAFormat, LinkedDNAFASTAFormat, DNAIterator, PairedDNAIterator,
ProteinIterator, AlignedProteinIterator,
PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
DifferentialFormat, AlignedDNAIterator, ProteinFASTAFormat,
Expand All @@ -34,6 +34,7 @@
)
from q2_types.feature_data._deferred_setup._transformers import (
_taxonomy_formats_to_dataframe, _dataframe_to_tsv_taxonomy_format,
_read_linked_from_fasta,
)


Expand Down Expand Up @@ -563,6 +564,92 @@ def test_dna_iterator_to_dna_fasta_format(self):
for act, exp in zip(obs, input):
self.assertEqual(act, exp)

def test_linked_dna_fasta_format_to_dna_iterator(self):
    '''
    Tests the LinkedDNAFASTAFormat -> DNAIterator transformation: IDs and
    sequences (including the linking space) are preserved, and records
    are plain skbio.Sequence objects.
    '''
    fasta_path = os.path.join(self.temp_dir.name, 'linked-dna.fasta')
    lines = ['>id1\n', 'ACGT ACGT\n', '>id2\n', 'ACGT\n']
    with open(fasta_path, 'w') as out:
        for line in lines:
            out.write(line)

    to_iterator = self.get_transformer(LinkedDNAFASTAFormat, DNAIterator)
    linked_fasta = LinkedDNAFASTAFormat(fasta_path, mode='r')
    records = list(to_iterator(linked_fasta))

    observed_ids = [record.metadata['id'] for record in records]
    observed_seqs = [str(record) for record in records]
    self.assertEqual(observed_ids, ['id1', 'id2'])
    self.assertEqual(observed_seqs, ['ACGT ACGT', 'ACGT'])
    # Exact type check: subclasses of skbio.Sequence must not appear here.
    exact_types = [type(record) is skbio.Sequence for record in records]
    self.assertTrue(all(exact_types))

def test_dna_iterator_to_linked_dna_fasta_format(self):
    '''
    Tests the DNAIterator -> LinkedDNAFASTAFormat transformation by
    re-reading the written file and comparing IDs and sequences.
    '''
    to_linked = self.get_transformer(DNAIterator, LinkedDNAFASTAFormat)
    sequences = [
        skbio.Sequence('ACGT ACGT', metadata={'id': 'id1'}),
        skbio.Sequence('ACGT', metadata={'id': 'id2'}),
    ]

    written = to_linked(DNAIterator(iter(sequences)))
    self.assertIsInstance(written, LinkedDNAFASTAFormat)

    round_tripped = list(_read_linked_from_fasta(str(written)))
    observed_ids = [record.metadata['id'] for record in round_tripped]
    observed_seqs = [str(record) for record in round_tripped]

    self.assertEqual(observed_ids, ['id1', 'id2'])
    self.assertEqual(observed_seqs, ['ACGT ACGT', 'ACGT'])

def test_linked_dnafasta_format_to_series(self):
    '''
    Tests the LinkedDNAFASTAFormat -> pd.Series transformation: the
    series is indexed by sequence ID and, once cast to str, holds the
    sequences with the linking space intact.
    '''
    fasta_path = os.path.join(self.temp_dir.name, 'linked-dna.fasta')
    lines = ['>id1\n', 'ACGT ACGT\n', '>id2\n', 'ACGT\n']
    with open(fasta_path, 'w') as out:
        for line in lines:
            out.write(line)

    to_series = self.get_transformer(LinkedDNAFASTAFormat, pd.Series)
    linked_fasta = LinkedDNAFASTAFormat(fasta_path, mode='r')
    observed = to_series(linked_fasta).astype(str)

    expected = pd.Series(
        ['ACGT ACGT', 'ACGT'],
        index=pd.Index(['id1', 'id2']),
        dtype=object,
    )

    assert_series_equal(expected, observed)

def test_series_to_linked_dnafasta_format(self):
    '''
    Tests the pd.Series -> LinkedDNAFASTAFormat transformation by
    re-reading the written file and comparing IDs and sequences.
    '''
    to_linked = self.get_transformer(pd.Series, LinkedDNAFASTAFormat)

    series = pd.Series(
        ['ACGT ACGT', 'ACGT'],
        index=pd.Index(['id1', 'id2']),
        dtype=object,
    )
    written = to_linked(series)

    self.assertIsInstance(written, LinkedDNAFASTAFormat)
    round_tripped = list(_read_linked_from_fasta(str(written)))
    observed_ids = [record.metadata['id'] for record in round_tripped]
    observed_seqs = [str(record) for record in round_tripped]
    self.assertEqual(observed_ids, ['id1', 'id2'])
    self.assertEqual(observed_seqs, ['ACGT ACGT', 'ACGT'])

def test_linked_dnafasta_format_with_duplicate_ids_to_series(self):
    '''
    Transforming a LinkedDNAFASTAFormat file that repeats a sequence ID
    into a pd.Series must raise a ValueError mentioning that ID.
    '''
    fasta_path = os.path.join(
        self.temp_dir.name, 'linked-duplicate-ids.fasta'
    )
    lines = ['>id1\n', 'ACGT ACGT\n', '>id1\n', 'ACGT\n']
    with open(fasta_path, 'w') as out:
        for line in lines:
            out.write(line)

    to_series = self.get_transformer(LinkedDNAFASTAFormat, pd.Series)
    linked_fasta = LinkedDNAFASTAFormat(fasta_path, mode='r')
    with self.assertRaisesRegex(ValueError, 'unique.*id1'):
        to_series(linked_fasta)

def test_aln_dna_fasta_format_to_aln_dna_iterator(self):
filename = 'aligned-dna-sequences.fasta'
input, obs = self.transform_format(AlignedDNAFASTAFormat,
Expand Down
Loading
Loading