Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ script:
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_trim --saveReference
# Run the basic pipeline with paired end data without adapterRemoval
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_adapterremoval --saveReference
# Run the basic pipeline with paired end data, outputting unmapped reads as fastq
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --unmap
# Run the same pipeline testing optional step: fastp, complexity
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --complexity_filter --bwa_index results/reference_genome/bwa_index/bwa_index/
# Test BAM Trimming
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

### `Added`

* [#189](https://github.com/nf-core/eager/pull/189) - Output unmapped reads in fastq files with the --unmap flag

### `Fixed`
* [#172](https://github.com/nf-core/eager/pull/172) - DamageProfiler errors [won't crash entire pipeline anymore](https://github.com/nf-core/eager/issues/171)

Expand Down
197 changes: 197 additions & 0 deletions bin/extract_map_reads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
#!/usr/bin/env python3

import argparse
import multiprocessing
import pysam
from functools import partial
import gzip
import sys


def _get_args():
'''This function parses and return arguments passed in'''
parser = argparse.ArgumentParser(
prog='extract_mapped_reads',
formatter_class=argparse.RawDescriptionHelpFormatter,
description=f'''
Remove mapped in bam file from fastq files
''')
parser.add_argument('bam_file', help="path to bam file")
parser.add_argument('fwd', help='path to forward fastq file')
parser.add_argument(
'-2',
dest="rev",
default=None,
help="path to forward fastq file")
parser.add_argument(
'-of',
dest="out_fwd",
default=None,
help="path to forward output fastq file")
parser.add_argument(
'-or',
dest="out_rev",
default=None,
help="path to forward output fastq file")
parser.add_argument(
'-p',
dest='process',
default=4,
help='Number of parallel processes'
)

args = parser.parse_args()

bam = args.bam_file
in_fwd = args.fwd
in_rev = args.rev
out_fwd = args.out_fwd
out_rev = args.out_rev
proc = int(args.process)

return(bam, in_fwd, in_rev, out_fwd, out_rev, proc)


def extract_mapped_chr(chrom, bam):
    """
    Get mapped read names for one chromosome
    INPUT:
    - chrom(str): chromosome/reference name (renamed from `chr`, which
      shadowed the Python builtin)
    - bam(str): bamfile path
    OUTPUT:
    - res(list): list of mapped read names (str) for that chromosome
    """
    bamfile = pysam.AlignmentFile(bam, "rb")
    # multiple_iterators lets several worker processes fetch concurrently
    reads = bamfile.fetch(chrom, multiple_iterators=True)
    return [read.query_name for read in reads]


def extract_mapped(bam, processes):
    """
    Get mapped read names in parallel (one worker per chromosome)
    INPUT:
    - bam(str): bamfile path
    - processes(int): number of parallel worker processes
    OUTPUT:
    - result(list): list of mapped read names (str)
    """
    try:
        bamfile = pysam.AlignmentFile(bam, "rb")
        chrs = bamfile.references
    except ValueError as e:
        # The original only printed the error and fell through, which then
        # raised a NameError on `chrs`; abort explicitly instead.
        print(e)
        sys.exit(1)
    extract_mapped_chr_partial = partial(extract_mapped_chr, bam=bam)
    # context manager guarantees the pool is closed even on error
    with multiprocessing.Pool(processes) as pool:
        per_chr_reads = pool.map(extract_mapped_chr_partial, chrs)
    # flatten the per-chromosome lists into a single list of names
    return [name for chr_reads in per_chr_reads for name in chr_reads]


def parse_fq(fq):
    """
    Parse a FASTQ file (plain or gzip-compressed)
    INPUT:
    - fq(str): path to fastq file
    OUTPUT:
    - fqd(dict): dictionary with read names as keys, seq and quality as values
        in a list
    """

    def get_fq_reads(allreads):
        # Read standard 4-line FASTQ records (header, seq, '+', quality)
        # instead of guessing header lines by instrument prefix, which broke
        # when a quality line happened to start with the same characters.
        # Also handles both bytes (gzip) and str (text mode) lines; the
        # original called .decode() unconditionally and crashed on plain
        # text files opened in 'r' mode.
        fqd = {}
        record = []
        for line in allreads:
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            record.append(line.rstrip())
            if len(record) == 4:
                header, seq, plus, qual = record
                # key is the read name: leading '@' and description dropped
                fqd[header[1:].split()[0]] = [seq, plus, qual]
                record = []
        return fqd

    if fq.endswith('.gz'):
        with gzip.open(fq, 'rb') as allreads:
            fqd = get_fq_reads(allreads)
    else:
        with open(fq, 'r') as allreads:
            fqd = get_fq_reads(allreads)

    return fqd


def remove_mapped(fq_dict, mapped_reads):
    """
    Remove mapped reads from dictionary of fastq reads
    INPUT:
    - fq_dict(dict) dictionary with read names as keys, seq and quality as values
        in a list
    - mapped_reads(list) list of mapped read names
    OUTPUT:
    - ufqd(dict) dictionary with unmapped read names as keys, seq and quality
        as values in a list
    """
    # set membership is O(1); the original list scan per read was O(n*m)
    mapped = set(mapped_reads)
    return {name: entry for name, entry in fq_dict.items()
            if name not in mapped}


def write_fq(fq_dict, fname):
    """
    Write reads to a fastq file (gzip-compressed when fname ends in .gz)
    INPUT:
    - fq_dict(dict) dictionary with unmapped read names as keys, seq and quality
        as values in a list
    - fname(string) Path to output fastq file
    """

    def records(d):
        # Re-add the '@' stripped by parse_fq so the output is valid FASTQ;
        # the original wrote bare read names, producing malformed files.
        for name, entry in d.items():
            yield f"@{name}\n"
            for line in entry:
                yield f"{line}\n"

    if fname.endswith('.gz'):
        with gzip.open(fname, 'wb') as f:
            for rec in records(fq_dict):
                f.write(rec.encode())
    else:
        with open(fname, 'w') as f:
            f.writelines(records(fq_dict))


if __name__ == "__main__":
    BAM, IN_FWD, IN_REV, OUT_FWD, OUT_REV, PROC = _get_args()

    # A reverse input without a reverse output destination is an error:
    # the reverse reads would otherwise be silently dropped.
    if IN_REV and not OUT_REV:
        print('You specified an input reverse fastq, but no output reverse fastq')
        sys.exit(1)

    if OUT_FWD is None:
        # default output name: input basename with .r1.fq.gz suffix
        out_fwd = f"{IN_FWD.split('/')[-1].split('.')[0]}.r1.fq.gz"
    else:
        out_fwd = OUT_FWD

    mapped_reads = extract_mapped(BAM, PROC)
    fwd_dict = parse_fq(IN_FWD)
    fwd_unmap = remove_mapped(fwd_dict, mapped_reads)
    write_fq(fwd_unmap, out_fwd)

    if IN_REV:
        if OUT_REV is None:
            # default output name: input basename with .r2.fq.gz suffix
            out_rev = f"{IN_REV.split('/')[-1].split('.')[0]}.r2.fq.gz"
        else:
            out_rev = OUT_REV
        rev_dict = parse_fq(IN_REV)
        rev_unmap = remove_mapped(rev_dict, mapped_reads)
        write_fq(rev_unmap, out_rev)
5 changes: 5 additions & 0 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ process {
withName: damageprofiler {
errorStrategy = 'ignore'
}

withName: extract_unmapped_reads {
cpus = { check_max(8 * task.attempt, 'cpus') }
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
}
}

params {
Expand Down
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,6 @@ dependencies:
- bioconda::fastp=0.19.7
- bioconda::bamutil=1.0.14
- bioconda::mtnucratio=0.5
- bioconda::pysam=0.15.2
- python=3.6
#Missing Schmutzi,snpAD
46 changes: 42 additions & 4 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def helpMessage() {
--bamutils_softclip Use softclip instead of hard masking

Other options:
--unmap Create fastq files from unaligned reads
--outdir The output directory where the results will be saved
--email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
--plaintext_email Receive plain text emails rather than HTML
Expand Down Expand Up @@ -212,6 +213,10 @@ params.bamutils_clip_left = 1
params.bamutils_clip_right = 1
params.bamutils_softclip = false

//unmap

params.unmap = false




Expand Down Expand Up @@ -295,14 +300,14 @@ if( params.readPaths ){
.from( params.readPaths )
.map { row -> [ row[0], [ file( row[1][0] ) ] ] }
.ifEmpty { exit 1, "params.readPaths or params.bams was empty - no input files supplied!" }
.into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g }
.into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g ; ch_read_unmap}
ch_bam_to_fastq_convert = Channel.empty()
} else if (!params.bam){
Channel
.from( params.readPaths )
.map { row -> [ row[0], [ file( row[1][0] ), file( row[1][1] ) ] ] }
.ifEmpty { exit 1, "params.readPaths or params.bams was empty - no input files supplied!" }
.into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g }
.into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g; ch_read_unmap }
ch_bam_to_fastq_convert = Channel.empty()
} else {
Channel
Expand All @@ -322,7 +327,7 @@ if( params.readPaths ){
.fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 )
.ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs" +
"to be enclosed in quotes!\nNB: Path requires at least one * wildcard!\nIf this is single-end data, please specify --singleEnd on the command line." }
.into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g }
.into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g; ch_read_unmap }
ch_bam_to_fastq_convert = Channel.empty()
} else {
Channel
Expand Down Expand Up @@ -355,6 +360,7 @@ if(params.bwa_index) summary['BWA Index'] = params.bwa_index
summary['Data Type'] = params.singleEnd ? 'Single-End' : 'Paired-End'
summary['Skip Collapsing'] = params.skip_collapse ? 'Yes' : 'No'
summary['Skip Trimming'] = params.skip_trim ? 'Yes' : 'No'
summary['Output unmapped reads as fastq'] = params.unmap ? 'Yes' : 'No'
summary['Max Memory'] = params.max_memory
summary['Max CPUs'] = params.max_cpus
summary['Max Time'] = params.max_time
Expand Down Expand Up @@ -857,7 +863,7 @@ process samtools_filter {
file bam from ch_mapped_reads_filter.mix(ch_mapped_reads_filter_cm,ch_bwamem_mapped_reads_filter)

output:
file "*filtered.bam" into ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk
file "*filtered.bam" into ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk, ch_bam_unmap
file "*.fastq.gz" optional true
file "*.unmapped.bam" optional true
file "*.{bai,csi}"
Expand Down Expand Up @@ -897,6 +903,38 @@ process samtools_filter {
}
}

/*
 * Extract reads not mapped in the filtered BAM back into fastq files.
 * Only runs when --unmap is set.
 */
process extract_unmapped_reads {
    tag "${bam.baseName}"
    publishDir "${params.outdir}/samtools/unmapped_fastq", mode: 'copy'

    when: params.unmap

    input:
    set val(name), file(fq) from ch_read_unmap
    file bam from ch_bam_unmap

    output:
    file "*.fq.gz" into unmapped_fq_ch


    script:
    // NOTE(review): the paired/single branches were previously swapped —
    // the pairedEnd branch emitted only a forward file and the single-end
    // branch emitted r1/r2. Also, -p was passed without a value (an
    // argparse error) and the paired command never passed the reverse
    // input fastq via -2.
    if (params.pairedEnd) {
        out_fwd = bam.baseName+'.unmapped.r1.fq.gz'
        out_rev = bam.baseName+'.unmapped.r2.fq.gz'
        """
        samtools index $bam
        extract_map_reads.py $bam ${fq[0]} -2 ${fq[1]} -of $out_fwd -or $out_rev -p ${task.cpus}
        """
    } else {
        out_fwd = bam.baseName+'.unmapped.fq.gz'
        """
        samtools index $bam
        extract_map_reads.py $bam ${fq[0]} -of $out_fwd -p ${task.cpus}
        """
    }

}


/*
Step 6: DeDup / MarkDuplicates
Expand Down