Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ script:
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_trim --saveReference
# Run the basic pipeline with paired end data without adapterRemoval
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_adapterremoval --saveReference
# Run the basic pipeline with paired end data, outputting unmapped reads as fastq
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --unmap
# Run the same pipeline testing optional step: fastp, complexity
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --complexity_filter --bwa_index results/reference_genome/bwa_index/bwa_index/
# Test BAM Trimming
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

### `Added`

* [#189](https://github.com/nf-core/eager/pull/189) - Output unmapped reads in fastq files with the --unmap flag

### `Fixed`
* [#172](https://github.com/nf-core/eager/pull/172) - DamageProfiler errors [won't crash entire pipeline anymore](https://github.com/nf-core/eager/issues/171)

Expand Down
197 changes: 197 additions & 0 deletions bin/extract_map_reads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
#!/usr/bin/env python3

import argparse
import multiprocessing
import pysam
from functools import partial
import gzip
import sys


def _get_args():
'''This function parses and return arguments passed in'''
parser = argparse.ArgumentParser(
prog='extract_mapped_reads',
formatter_class=argparse.RawDescriptionHelpFormatter,
description=f'''
Remove mapped in bam file from fastq files
''')
parser.add_argument('bam_file', help="path to bam file")
parser.add_argument('fwd', help='path to forward fastq file')
parser.add_argument(
'-2',
dest="rev",
default=None,
help="path to forward fastq file")
parser.add_argument(
'-of',
dest="out_fwd",
default=None,
help="path to forward output fastq file")
parser.add_argument(
'-or',
dest="out_rev",
default=None,
help="path to forward output fastq file")
parser.add_argument(
'-p',
dest='process',
default=4,
help='Number of parallel processes'
)

args = parser.parse_args()

bam = args.bam_file
in_fwd = args.fwd
in_rev = args.rev
out_fwd = args.out_fwd
out_rev = args.out_rev
proc = int(args.process)

return(bam, in_fwd, in_rev, out_fwd, out_rev, proc)


def extract_mapped_chr(chrom, bam):
    """
    Get mapped read names for one chromosome
    INPUT:
    - chrom(str): chromosome/reference name (renamed from `chr`, which
      shadowed the Python builtin)
    - bam(str): bamfile path
    OUTPUT:
    - res(list): list of mapped read names (str) for that chromosome
    """
    bamfile = pysam.AlignmentFile(bam, "rb")
    # multiple_iterators lets several worker processes fetch concurrently
    reads = bamfile.fetch(chrom, multiple_iterators=True)
    return [read.query_name for read in reads]


def extract_mapped(bam, processes):
    """
    Get mapped read names in parallel (one worker per chromosome)
    INPUT:
    - bam(str): bamfile path
    - processes(int): number of parallel worker processes
    OUTPUT:
    - result(list): list of mapped read names (str)
    """
    try:
        bamfile = pysam.AlignmentFile(bam, "rb")
        chrs = bamfile.references
    except ValueError as e:
        # The original only printed the error and fell through, which then
        # raised a NameError on `chrs`; abort explicitly instead.
        print(e)
        sys.exit(1)
    extract_mapped_chr_partial = partial(extract_mapped_chr, bam=bam)
    # context manager guarantees the pool is closed even on error
    with multiprocessing.Pool(processes) as pool:
        per_chr_reads = pool.map(extract_mapped_chr_partial, chrs)
    # flatten the per-chromosome lists into a single list of names
    return [name for chr_reads in per_chr_reads for name in chr_reads]


def parse_fq(fq):
    """
    Parse a FASTQ file (plain or gzip-compressed)
    INPUT:
    - fq(str): path to fastq file
    OUTPUT:
    - fqd(dict): dictionary with read names as keys, seq and quality as values
        in a list
    """

    def get_fq_reads(allreads):
        # Read standard 4-line FASTQ records (header, seq, '+', quality)
        # instead of guessing header lines by instrument prefix, which broke
        # when a quality line happened to start with the same characters.
        # Also handles both bytes (gzip) and str (text mode) lines; the
        # original called .decode() unconditionally and crashed on plain
        # text files opened in 'r' mode.
        fqd = {}
        record = []
        for line in allreads:
            if isinstance(line, bytes):
                line = line.decode('utf-8')
            record.append(line.rstrip())
            if len(record) == 4:
                header, seq, plus, qual = record
                # key is the read name: leading '@' and description dropped
                fqd[header[1:].split()[0]] = [seq, plus, qual]
                record = []
        return fqd

    if fq.endswith('.gz'):
        with gzip.open(fq, 'rb') as allreads:
            fqd = get_fq_reads(allreads)
    else:
        with open(fq, 'r') as allreads:
            fqd = get_fq_reads(allreads)

    return fqd


def remove_mapped(fq_dict, mapped_reads):
    """
    Remove mapped reads from dictionary of fastq reads
    INPUT:
    - fq_dict(dict) dictionary with read names as keys, seq and quality as values
        in a list
    - mapped_reads(list) list of mapped read names
    OUTPUT:
    - ufqd(dict) dictionary with unmapped read names as keys, seq and quality
        as values in a list
    """
    # set membership is O(1); the original list scan per read was O(n*m)
    mapped = set(mapped_reads)
    return {name: entry for name, entry in fq_dict.items()
            if name not in mapped}


def write_fq(fq_dict, fname):
    """
    Write reads to a fastq file (gzip-compressed when fname ends in .gz)
    INPUT:
    - fq_dict(dict) dictionary with unmapped read names as keys, seq and quality
        as values in a list
    - fname(string) Path to output fastq file
    """

    def records(d):
        # Re-add the '@' stripped by parse_fq so the output is valid FASTQ;
        # the original wrote bare read names, producing malformed files.
        for name, entry in d.items():
            yield f"@{name}\n"
            for line in entry:
                yield f"{line}\n"

    if fname.endswith('.gz'):
        with gzip.open(fname, 'wb') as f:
            for rec in records(fq_dict):
                f.write(rec.encode())
    else:
        with open(fname, 'w') as f:
            f.writelines(records(fq_dict))


if __name__ == "__main__":
    BAM, IN_FWD, IN_REV, OUT_FWD, OUT_REV, PROC = _get_args()

    # A reverse input without a reverse output destination is an error:
    # the reverse reads would otherwise be silently dropped.
    if IN_REV and not OUT_REV:
        print('You specified an input reverse fastq, but no output reverse fastq')
        sys.exit(1)

    if OUT_FWD is None:
        # default output name: input basename with .r1.fq.gz suffix
        out_fwd = f"{IN_FWD.split('/')[-1].split('.')[0]}.r1.fq.gz"
    else:
        out_fwd = OUT_FWD

    mapped_reads = extract_mapped(BAM, PROC)
    fwd_dict = parse_fq(IN_FWD)
    fwd_unmap = remove_mapped(fwd_dict, mapped_reads)
    write_fq(fwd_unmap, out_fwd)

    if IN_REV:
        if OUT_REV is None:
            # default output name: input basename with .r2.fq.gz suffix
            out_rev = f"{IN_REV.split('/')[-1].split('.')[0]}.r2.fq.gz"
        else:
            out_rev = OUT_REV
        rev_dict = parse_fq(IN_REV)
        rev_unmap = remove_mapped(rev_dict, mapped_reads)
        write_fq(rev_unmap, out_rev)
5 changes: 5 additions & 0 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ process {
withName: damageprofiler {
errorStrategy = 'ignore'
}

withName: extract_unmapped_reads {
cpus = { check_max(8 * task.attempt, 'cpus') }
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
}
}

params {
Expand Down
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,6 @@ dependencies:
- bioconda::fastp=0.19.7
- bioconda::bamutil=1.0.14
- bioconda::mtnucratio=0.5
- bioconda::pysam=0.15.2
- python=3.6
#Missing Schmutzi,snpAD
46 changes: 42 additions & 4 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def helpMessage() {
--bamutils_softclip Use softclip instead of hard masking

Other options:
--unmap Create fastq files from unaligned reads
--outdir The output directory where the results will be saved
--email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
--plaintext_email Receive plain text emails rather than HTML
Expand Down Expand Up @@ -212,6 +213,10 @@ params.bamutils_clip_left = 1
params.bamutils_clip_right = 1
params.bamutils_softclip = false

//unmap

params.unmap = false




Expand Down Expand Up @@ -295,14 +300,14 @@ if( params.readPaths ){
.from( params.readPaths )
.map { row -> [ row[0], [ file( row[1][0] ) ] ] }
.ifEmpty { exit 1, "params.readPaths or params.bams was empty - no input files supplied!" }
.into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g }
.into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g ; ch_read_unmap}
ch_bam_to_fastq_convert = Channel.empty()
} else if (!params.bam){
Channel
.from( params.readPaths )
.map { row -> [ row[0], [ file( row[1][0] ), file( row[1][1] ) ] ] }
.ifEmpty { exit 1, "params.readPaths or params.bams was empty - no input files supplied!" }
.into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g }
.into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g; ch_read_unmap }
ch_bam_to_fastq_convert = Channel.empty()
} else {
Channel
Expand All @@ -322,7 +327,7 @@ if( params.readPaths ){
.fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 )
.ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs" +
"to be enclosed in quotes!\nNB: Path requires at least one * wildcard!\nIf this is single-end data, please specify --singleEnd on the command line." }
.into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g }
.into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g; ch_read_unmap }
ch_bam_to_fastq_convert = Channel.empty()
} else {
Channel
Expand Down Expand Up @@ -355,6 +360,7 @@ if(params.bwa_index) summary['BWA Index'] = params.bwa_index
summary['Data Type'] = params.singleEnd ? 'Single-End' : 'Paired-End'
summary['Skip Collapsing'] = params.skip_collapse ? 'Yes' : 'No'
summary['Skip Trimming'] = params.skip_trim ? 'Yes' : 'No'
summary['Output unmapped reads as fastq'] = params.unmap ? 'Yes' : 'No'
summary['Max Memory'] = params.max_memory
summary['Max CPUs'] = params.max_cpus
summary['Max Time'] = params.max_time
Expand Down Expand Up @@ -857,7 +863,7 @@ process samtools_filter {
file bam from ch_mapped_reads_filter.mix(ch_mapped_reads_filter_cm,ch_bwamem_mapped_reads_filter)

output:
file "*filtered.bam" into ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk
file "*filtered.bam" into ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk, ch_bam_unmap
file "*.fastq.gz" optional true
file "*.unmapped.bam" optional true
file "*.{bai,csi}"
Expand Down Expand Up @@ -897,6 +903,38 @@ process samtools_filter {
}
}

/*
 * Extract reads not mapped in the filtered BAM back into fastq files.
 * Only runs when --unmap is set.
 */
process extract_unmapped_reads {
    tag "${bam.baseName}"
    publishDir "${params.outdir}/samtools/unmapped_fastq", mode: 'copy'

    when: params.unmap

    input:
    set val(name), file(fq) from ch_read_unmap
    file bam from ch_bam_unmap

    output:
    file "*.fq.gz" into unmapped_fq_ch


    script:
    // NOTE(review): the paired/single branches were previously swapped —
    // the pairedEnd branch emitted only a forward file and the single-end
    // branch emitted r1/r2. Also, -p was passed without a value (an
    // argparse error) and the paired command never passed the reverse
    // input fastq via -2.
    if (params.pairedEnd) {
        out_fwd = bam.baseName+'.unmapped.r1.fq.gz'
        out_rev = bam.baseName+'.unmapped.r2.fq.gz'
        """
        samtools index $bam
        extract_map_reads.py $bam ${fq[0]} -2 ${fq[1]} -of $out_fwd -or $out_rev -p ${task.cpus}
        """
    } else {
        out_fwd = bam.baseName+'.unmapped.fq.gz'
        """
        samtools index $bam
        extract_map_reads.py $bam ${fq[0]} -of $out_fwd -p ${task.cpus}
        """
    }

}


/*
Step 6: DeDup / MarkDuplicates
Expand Down