Skip to content

Commit 6d229c8

Browse files
authored
Merge pull request #1059 from broadinstitute/dp/rasusa-coverage-normalization
Add rasusa-based coverage normalization for assembly polishing
2 parents b2cc4a7 + 9590a8c commit 6d229c8

4 files changed

Lines changed: 178 additions & 30 deletions

File tree

src/viral_ngs/assemble/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,6 @@
1313
from . import skani
1414
from . import spades
1515
from . import freebayes
16+
from . import rasusa
1617
from . import vcf
1718
from . import wgsim

src/viral_ngs/assemble/rasusa.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
'''
2+
Rasusa — randomly subsample sequencing reads or alignments to a
3+
specified coverage depth.
4+
5+
https://github.com/mbhall88/rasusa
6+
'''
7+
8+
import logging
9+
import shutil
10+
import subprocess
11+
12+
from ..core import Tool, PrexistingUnixCommand
13+
14+
_log = logging.getLogger(__name__)
15+
16+
TOOL_NAME = 'rasusa'
17+
18+
19+
class RasusaTool(Tool):
20+
21+
def __init__(self):
22+
install_methods = [PrexistingUnixCommand(shutil.which(TOOL_NAME), require_executability=True)]
23+
Tool.__init__(self, install_methods=install_methods)
24+
25+
def downsample_bam(self, inBam, outBam, coverage, seed=None):
26+
"""Downsample aligned reads in a BAM to a target coverage depth
27+
using ``rasusa aln``.
28+
29+
The input BAM must be coordinate-sorted. The output BAM is
30+
**not** guaranteed to be sorted — the caller is responsible for
31+
sorting and indexing if downstream tools require it.
32+
33+
Args:
34+
inBam: Input BAM file (coordinate-sorted).
35+
outBam: Output BAM file path.
36+
coverage: Target coverage depth (integer).
37+
seed: Random seed for reproducibility (optional).
38+
"""
39+
tool_cmd = [
40+
self.install_and_get_path(), 'aln',
41+
'--coverage', str(coverage),
42+
'-o', outBam,
43+
inBam,
44+
]
45+
if seed is not None:
46+
tool_cmd.extend(['--seed', str(seed)])
47+
_log.debug(' '.join(tool_cmd))
48+
subprocess.check_call(tool_cmd)

src/viral_ngs/assembly.py

Lines changed: 96 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import viral_ngs.core.samtools
3232
import viral_ngs.core.novoalign
3333
import viral_ngs.assemble.freebayes
34+
import viral_ngs.assemble.rasusa
3435

3536
# intra-module (assembly tool wrappers)
3637
import viral_ngs.assemble.vcf
@@ -670,6 +671,8 @@ def refine_assembly(
670671
major_cutoff=0.5,
671672
chr_names=None,
672673
keep_all_reads=False,
674+
max_coverage=None,
675+
rasusa_seed=None,
673676
already_realigned_bam=None,
674677
JVMmemory=None,
675678
threads=None,
@@ -737,36 +740,50 @@ def refine_assembly(
737740
shutil.copyfile(realignBam, outBam)
738741

739742
# Modify original assembly with VCF calls from FreeBayes
740-
tmpVcf = viral_ngs.core.file.mkstempfname('.vcf.gz')
741-
tmpFasta = viral_ngs.core.file.mkstempfname('.fasta')
742-
fb.call(realignBam, deambigFasta, tmpVcf)
743-
if already_realigned_bam is None:
744-
os.unlink(realignBam)
745-
os.unlink(deambigFasta)
746-
name_opts = []
747-
if chr_names:
748-
name_opts = ['--name'] + chr_names
749-
main_vcf_to_fasta(
750-
parser_vcf_to_fasta(argparse.ArgumentParser(
751-
)).parse_args([
752-
tmpVcf,
753-
tmpFasta,
754-
'--trim_ends',
755-
'--min_coverage',
756-
str(min_coverage),
757-
'--major_cutoff',
758-
str(major_cutoff)
759-
] + name_opts)
760-
)
761-
if outVcf:
762-
shutil.copyfile(tmpVcf, outVcf)
763-
if outVcf.endswith('.gz'):
764-
shutil.copyfile(tmpVcf + '.tbi', outVcf + '.tbi')
765-
os.unlink(tmpVcf)
766-
if os.path.exists(tmpVcf + '.tbi'):
767-
os.unlink(tmpVcf + '.tbi')
768-
shutil.copyfile(tmpFasta, outFasta)
769-
os.unlink(tmpFasta)
743+
with viral_ngs.core.file.tempfname('.vcf.gz') as tmpVcf, \
744+
viral_ngs.core.file.tempfname('.fasta') as tmpFasta:
745+
746+
# Optionally downsample coverage for variant calling only
747+
if max_coverage:
748+
rasusa_tool = viral_ngs.assemble.rasusa.RasusaTool()
749+
with viral_ngs.core.file.tempfname('.downsampled.unsorted.bam') as downsampledUnsortedBam, \
750+
viral_ngs.core.file.tempfname('.downsampled.sorted.bam') as downsampledBam:
751+
rasusa_tool.downsample_bam(realignBam, downsampledUnsortedBam, max_coverage, seed=rasusa_seed)
752+
samtools.sort(downsampledUnsortedBam, downsampledBam, threads=threads)
753+
samtools.index(downsampledBam, threads=threads)
754+
fb.call(downsampledBam, deambigFasta, tmpVcf)
755+
# clean up samtools index sidecar
756+
if os.path.isfile(downsampledBam + '.bai'):
757+
os.unlink(downsampledBam + '.bai')
758+
else:
759+
fb.call(realignBam, deambigFasta, tmpVcf)
760+
761+
if already_realigned_bam is None:
762+
os.unlink(realignBam)
763+
os.unlink(deambigFasta)
764+
name_opts = []
765+
if chr_names:
766+
name_opts = ['--name'] + chr_names
767+
main_vcf_to_fasta(
768+
parser_vcf_to_fasta(argparse.ArgumentParser(
769+
)).parse_args([
770+
tmpVcf,
771+
tmpFasta,
772+
'--trim_ends',
773+
'--min_coverage',
774+
str(min_coverage),
775+
'--major_cutoff',
776+
str(major_cutoff)
777+
] + name_opts)
778+
)
779+
if outVcf:
780+
shutil.copyfile(tmpVcf, outVcf)
781+
if outVcf.endswith('.gz'):
782+
shutil.copyfile(tmpVcf + '.tbi', outVcf + '.tbi')
783+
shutil.copyfile(tmpFasta, outFasta)
784+
# clean up tabix index sidecar created by FreeBayes/tabix
785+
if os.path.isfile(tmpVcf + '.tbi'):
786+
os.unlink(tmpVcf + '.tbi')
770787

771788
# Index final output FASTA for Picard, Samtools, and Novoalign
772789
picard_index.execute(outFasta, overwrite=True)
@@ -837,6 +854,22 @@ def parser_refine_assembly(parser=argparse.ArgumentParser()):
837854
action="store_true",
838855
dest="keep_all_reads"
839856
)
857+
parser.add_argument(
858+
'--max_coverage',
859+
default=None,
860+
type=int,
861+
help="""Maximum read coverage depth for variant calling. If specified,
862+
reads are downsampled to this coverage using rasusa before
863+
FreeBayes variant calling. The downsampled BAM is internal only
864+
and not included in any output. [default: %(default)s]"""
865+
)
866+
parser.add_argument(
867+
'--rasusa_seed',
868+
default=None,
869+
type=int,
870+
help="""Random seed for rasusa downsampling reproducibility.
871+
Only used when --max_coverage is set. [default: %(default)s]"""
872+
)
840873
parser.add_argument(
841874
'--JVMmemory',
842875
default='2g',
@@ -856,6 +889,39 @@ def parser_refine_assembly(parser=argparse.ArgumentParser()):
856889
__commands__.append(('refine_assembly', parser_refine_assembly))
857890

858891

892+
def normalize_coverage(inBam, outBam, max_coverage, seed=None, threads=None):
893+
'''Downsample an aligned BAM to a maximum coverage depth using rasusa.
894+
The output BAM is coordinate-sorted and indexed.
895+
'''
896+
samtools = viral_ngs.core.samtools.SamtoolsTool()
897+
rasusa_tool = viral_ngs.assemble.rasusa.RasusaTool()
898+
899+
with viral_ngs.core.file.tempfname('.rasusa.unsorted.bam') as tmpBam:
900+
rasusa_tool.downsample_bam(inBam, tmpBam, max_coverage, seed=seed)
901+
samtools.sort(tmpBam, outBam, threads=threads)
902+
903+
samtools.index(outBam, threads=threads)
904+
return outBam
905+
906+
907+
def parser_normalize_coverage(parser=argparse.ArgumentParser()):
908+
parser.add_argument('inBam', help='Input aligned BAM file, coordinate-sorted.')
909+
parser.add_argument('outBam', help='Output downsampled BAM file, coordinate-sorted and indexed.')
910+
parser.add_argument('max_coverage', type=int, help='Maximum coverage depth to downsample to.')
911+
parser.add_argument(
912+
'--seed',
913+
default=None,
914+
type=int,
915+
help='Random seed for rasusa reproducibility. [default: %(default)s]'
916+
)
917+
viral_ngs.core.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
918+
viral_ngs.core.cmd.attach_main(parser, normalize_coverage, split_args=True)
919+
return parser
920+
921+
922+
__commands__.append(('normalize_coverage', parser_normalize_coverage))
923+
924+
859925
def parser_filter_short_seqs(parser=argparse.ArgumentParser()):
860926
parser.add_argument("inFile", help="input sequence file")
861927
parser.add_argument("minLength", help="minimum length for contig", type=int)

tests/unit/assemble/test_assembly.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,39 @@ def test_empty_input_succeed(self):
123123
self.assertTrue(os.path.getsize(outFasta) == 0)
124124

125125

126+
class TestNormalizeCoverage(TestCaseWithTmp):
127+
'''Test the normalize_coverage command'''
128+
129+
def test_help_parser(self):
130+
parser = viral_ngs.assembly.parser_normalize_coverage(argparse.ArgumentParser())
131+
helpstring = parser.format_help()
132+
self.assertIn('max_coverage', helpstring)
133+
self.assertIn('seed', helpstring)
134+
135+
def test_normalize_coverage_on_aligned_bam(self):
136+
'''normalize_coverage should downsample and produce a sorted, indexed BAM'''
137+
mm2 = viral_ngs.core.minimap2.Minimap2()
138+
samtools = viral_ngs.core.samtools.SamtoolsTool()
139+
140+
inDir = viral_ngs.core.file.get_test_input_path()
141+
inBam = os.path.join(inDir, 'G5012.3.testreads.bam')
142+
refFasta = os.path.join(inDir, 'ebov-makona.fasta')
143+
144+
with viral_ngs.core.file.tempfname('.aligned.bam') as alignedBam, \
145+
viral_ngs.core.file.tempfname('.normalized.bam') as outBam:
146+
mm2.align_bam(inBam, refFasta, alignedBam, options=['-x', 'sr'])
147+
samtools.index(alignedBam)
148+
149+
viral_ngs.assembly.normalize_coverage(alignedBam, outBam, max_coverage=10, seed=42)
150+
151+
self.assertTrue(os.path.isfile(outBam))
152+
self.assertGreater(os.path.getsize(outBam), 0)
153+
# index file should also exist
154+
self.assertTrue(os.path.isfile(outBam + '.bai'))
155+
# output should have fewer or equal reads compared to input
156+
self.assertLessEqual(samtools.count(outBam), samtools.count(alignedBam))
157+
158+
126159
class TestAssembleSpades(TestCaseWithTmp):
127160
''' Test the assemble_spades command (no validation of output) '''
128161

0 commit comments

Comments
 (0)