3131import viral_ngs .core .samtools
3232import viral_ngs .core .novoalign
3333import viral_ngs .assemble .freebayes
34+ import viral_ngs .assemble .rasusa
3435
3536# intra-module (assembly tool wrappers)
3637import viral_ngs .assemble .vcf
@@ -670,6 +671,8 @@ def refine_assembly(
670671 major_cutoff = 0.5 ,
671672 chr_names = None ,
672673 keep_all_reads = False ,
674+ max_coverage = None ,
675+ rasusa_seed = None ,
673676 already_realigned_bam = None ,
674677 JVMmemory = None ,
675678 threads = None ,
@@ -737,36 +740,50 @@ def refine_assembly(
737740 shutil .copyfile (realignBam , outBam )
738741
739742 # Modify original assembly with VCF calls from FreeBayes
740- tmpVcf = viral_ngs .core .file .mkstempfname ('.vcf.gz' )
741- tmpFasta = viral_ngs .core .file .mkstempfname ('.fasta' )
742- fb .call (realignBam , deambigFasta , tmpVcf )
743- if already_realigned_bam is None :
744- os .unlink (realignBam )
745- os .unlink (deambigFasta )
746- name_opts = []
747- if chr_names :
748- name_opts = ['--name' ] + chr_names
749- main_vcf_to_fasta (
750- parser_vcf_to_fasta (argparse .ArgumentParser (
751- )).parse_args ([
752- tmpVcf ,
753- tmpFasta ,
754- '--trim_ends' ,
755- '--min_coverage' ,
756- str (min_coverage ),
757- '--major_cutoff' ,
758- str (major_cutoff )
759- ] + name_opts )
760- )
761- if outVcf :
762- shutil .copyfile (tmpVcf , outVcf )
763- if outVcf .endswith ('.gz' ):
764- shutil .copyfile (tmpVcf + '.tbi' , outVcf + '.tbi' )
765- os .unlink (tmpVcf )
766- if os .path .exists (tmpVcf + '.tbi' ):
767- os .unlink (tmpVcf + '.tbi' )
768- shutil .copyfile (tmpFasta , outFasta )
769- os .unlink (tmpFasta )
743+ with viral_ngs .core .file .tempfname ('.vcf.gz' ) as tmpVcf , \
744+ viral_ngs .core .file .tempfname ('.fasta' ) as tmpFasta :
745+
746+ # Optionally downsample coverage for variant calling only
747+ if max_coverage :
748+ rasusa_tool = viral_ngs .assemble .rasusa .RasusaTool ()
749+ with viral_ngs .core .file .tempfname ('.downsampled.unsorted.bam' ) as downsampledUnsortedBam , \
750+ viral_ngs .core .file .tempfname ('.downsampled.sorted.bam' ) as downsampledBam :
751+ rasusa_tool .downsample_bam (realignBam , downsampledUnsortedBam , max_coverage , seed = rasusa_seed )
752+ samtools .sort (downsampledUnsortedBam , downsampledBam , threads = threads )
753+ samtools .index (downsampledBam , threads = threads )
754+ fb .call (downsampledBam , deambigFasta , tmpVcf )
755+ # clean up samtools index sidecar
756+ if os .path .isfile (downsampledBam + '.bai' ):
757+ os .unlink (downsampledBam + '.bai' )
758+ else :
759+ fb .call (realignBam , deambigFasta , tmpVcf )
760+
761+ if already_realigned_bam is None :
762+ os .unlink (realignBam )
763+ os .unlink (deambigFasta )
764+ name_opts = []
765+ if chr_names :
766+ name_opts = ['--name' ] + chr_names
767+ main_vcf_to_fasta (
768+ parser_vcf_to_fasta (argparse .ArgumentParser (
769+ )).parse_args ([
770+ tmpVcf ,
771+ tmpFasta ,
772+ '--trim_ends' ,
773+ '--min_coverage' ,
774+ str (min_coverage ),
775+ '--major_cutoff' ,
776+ str (major_cutoff )
777+ ] + name_opts )
778+ )
779+ if outVcf :
780+ shutil .copyfile (tmpVcf , outVcf )
781+ if outVcf .endswith ('.gz' ):
782+ shutil .copyfile (tmpVcf + '.tbi' , outVcf + '.tbi' )
783+ shutil .copyfile (tmpFasta , outFasta )
784+ # clean up tabix index sidecar created by FreeBayes/tabix
785+ if os .path .isfile (tmpVcf + '.tbi' ):
786+ os .unlink (tmpVcf + '.tbi' )
770787
771788 # Index final output FASTA for Picard, Samtools, and Novoalign
772789 picard_index .execute (outFasta , overwrite = True )
@@ -837,6 +854,22 @@ def parser_refine_assembly(parser=argparse.ArgumentParser()):
837854 action = "store_true" ,
838855 dest = "keep_all_reads"
839856 )
857+ parser .add_argument (
858+ '--max_coverage' ,
859+ default = None ,
860+ type = int ,
861+ help = """Maximum read coverage depth for variant calling. If specified,
862+ reads are downsampled to this coverage using rasusa before
863+ FreeBayes variant calling. The downsampled BAM is internal only
864+ and not included in any output. [default: %(default)s]"""
865+ )
866+ parser .add_argument (
867+ '--rasusa_seed' ,
868+ default = None ,
869+ type = int ,
870+ help = """Random seed for rasusa downsampling reproducibility.
871+ Only used when --max_coverage is set. [default: %(default)s]"""
872+ )
840873 parser .add_argument (
841874 '--JVMmemory' ,
842875 default = '2g' ,
@@ -856,6 +889,39 @@ def parser_refine_assembly(parser=argparse.ArgumentParser()):
856889__commands__ .append (('refine_assembly' , parser_refine_assembly ))
857890
858891
892+ def normalize_coverage (inBam , outBam , max_coverage , seed = None , threads = None ):
893+ '''Downsample an aligned BAM to a maximum coverage depth using rasusa.
894+ The output BAM is coordinate-sorted and indexed.
895+ '''
896+ samtools = viral_ngs .core .samtools .SamtoolsTool ()
897+ rasusa_tool = viral_ngs .assemble .rasusa .RasusaTool ()
898+
899+ with viral_ngs .core .file .tempfname ('.rasusa.unsorted.bam' ) as tmpBam :
900+ rasusa_tool .downsample_bam (inBam , tmpBam , max_coverage , seed = seed )
901+ samtools .sort (tmpBam , outBam , threads = threads )
902+
903+ samtools .index (outBam , threads = threads )
904+ return outBam
905+
906+
907+ def parser_normalize_coverage (parser = argparse .ArgumentParser ()):
908+ parser .add_argument ('inBam' , help = 'Input aligned BAM file, coordinate-sorted.' )
909+ parser .add_argument ('outBam' , help = 'Output downsampled BAM file, coordinate-sorted and indexed.' )
910+ parser .add_argument ('max_coverage' , type = int , help = 'Maximum coverage depth to downsample to.' )
911+ parser .add_argument (
912+ '--seed' ,
913+ default = None ,
914+ type = int ,
915+ help = 'Random seed for rasusa reproducibility. [default: %(default)s]'
916+ )
917+ viral_ngs .core .cmd .common_args (parser , (('threads' , None ), ('loglevel' , None ), ('version' , None ), ('tmp_dir' , None )))
918+ viral_ngs .core .cmd .attach_main (parser , normalize_coverage , split_args = True )
919+ return parser
920+
921+
922+ __commands__ .append (('normalize_coverage' , parser_normalize_coverage ))
923+
924+
859925def parser_filter_short_seqs (parser = argparse .ArgumentParser ()):
860926 parser .add_argument ("inFile" , help = "input sequence file" )
861927 parser .add_argument ("minLength" , help = "minimum length for contig" , type = int )
0 commit comments