Merge pull request #189 from maxibor/dev

apeltzer · web-flow · commit 4d2f0622440b · 2019-04-10T14:09:46.000+02:00
Fastq from unmapped reads
diff --git a/.travis.yml b/.travis.yml
@@ -50,6 +50,8 @@ script:
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_trim --saveReference
    # Run the basic pipeline with paired end data without adapterRemoval
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_adapterremoval --saveReference
+   # Run the basic pipeline and output the unmapped reads as FASTQ
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --strip_input_fastq
   # Run the same pipeline testing optional step: fastp, complexity 
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --complexity_filter --bwa_index results/reference_genome/bwa_index/bwa_index/
   # Test BAM Trimming
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### `Added`
 
+* [#189](https://github.com/nf-core/eager/pull/189) - Outputting unmapped reads to FASTQ files with the `--strip_input_fastq` flag
 * [#186](https://github.com/nf-core/eager/pull/186) - Make FastQC skipping [possible](https://github.com/nf-core/eager/issues/182)
 * Merged in [nf-core/tools](https://github.com/nf-core/tools) release V1.6 template changes  
diff --git a/bin/extract_map_reads.py b/bin/extract_map_reads.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+
+import argparse
+import multiprocessing
+import pysam
+from functools import partial
+import gzip
+import sys
+
+
+def _get_args():
+    '''Parse the command-line arguments and return them as a tuple.
+
+    OUTPUT:
+    - (bam, in_fwd, in_rev, out_fwd, out_rev, mode, proc): paths to the bam
+        file and to the input/output fastq files (None for every option that
+        was not given), the read removal mode (str) and the number of
+        parallel processes (int)
+    '''
+    parser = argparse.ArgumentParser(
+        prog='extract_mapped_reads',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description='''
+Remove mapped in bam file from fastq files
+        ''')
+    parser.add_argument('bam_file', help="path to bam file")
+    parser.add_argument('fwd', help='path to forward fastq file')
+    parser.add_argument(
+        '-2',
+        dest="rev",
+        default=None,
+        help="path to reverse fastq file")
+    parser.add_argument(
+        '-of',
+        dest="out_fwd",
+        default=None,
+        help="path to forward output fastq file")
+    parser.add_argument(
+        '-or',
+        dest="out_rev",
+        default=None,
+        help="path to reverse output fastq file")
+    parser.add_argument(
+        '-m',
+        dest='mode',
+        default='strip',
+        # Reject anything else up front: an unknown mode would otherwise only
+        # show up as an empty output file.
+        choices=['strip', 'replace'],
+        help='Read removal mode: remove reads (strip) or replace sequence by N (replace)'
+    )
+    parser.add_argument(
+        '-p',
+        dest='process',
+        default=4,
+        # Let argparse report a non-integer value as a usage error.
+        type=int,
+        help='Number of parallel processes'
+    )
+
+    args = parser.parse_args()
+
+    return (
+        args.bam_file,
+        args.fwd,
+        args.rev,
+        args.out_fwd,
+        args.out_rev,
+        args.mode,
+        args.process,
+    )
+
+
+def extract_mapped_chr(chr, bam):
+    """
+    Get the names of the reads aligned to one chromosome
+    INPUT:
+    - chr(str): chromosome
+    - bam(str): bamfile path
+    OUTPUT:
+    - res(list): list of mapped reads (str) name per chromosome
+    """
+    # Open the file inside a context manager so the handle is released when
+    # this call finishes (it is run once per chromosome by the caller).
+    with pysam.AlignmentFile(bam, "rb") as bamfile:
+        # NOTE(review): fetch() returns every record stored under this
+        # chromosome; records flagged unmapped but carrying coordinates would
+        # be included too - confirm whether they should be skipped.
+        reads = bamfile.fetch(chr, multiple_iterators=True)
+        # Consume the iterator before the file is closed.
+        res = [read.query_name for read in reads]
+    return(res)
+
+
+def extract_mapped(bam, processes):
+    """
+    Get mapped reads in parallel, one task per reference sequence
+    INPUT:
+    - bam(str): bamfile path
+    - processes(int): number of worker processes
+    OUTPUT:
+    - result(list) list of mapped reads name (str)
+
+    Errors raised while opening the bam file are not caught here: without the
+    list of references there is nothing to extract, so the failure is
+    reported as is instead of surfacing later as an unrelated NameError.
+    """
+    # Only the header is needed here; each worker reopens the file itself.
+    with pysam.AlignmentFile(bam, "rb") as bamfile:
+        chrs = bamfile.references
+
+    extract_mapped_chr_partial = partial(extract_mapped_chr, bam=bam)
+    # map() blocks until every chromosome is done, so leaving the "with"
+    # block (which terminates the pool) cannot drop pending work.
+    with multiprocessing.Pool(processes) as pool:
+        res = pool.map(extract_mapped_chr_partial, chrs)
+
+    # Flatten the per-chromosome lists into a single list of read names.
+    result = [name for per_chr in res for name in per_chr]
+    return(result)
+
+
+def parse_fq(fq):
+    """
+    Parse a FASTQ file (plain text, or gzip-compressed when the path ends
+    with '.gz')
+    INPUT:
+    - fq(str): path to fastq file
+    OUTPUT:
+    - fqd(dict): dictionary with read names as keys, seq and quality as values
+        in a list (the three lines following the header: sequence, separator
+        and quality)
+
+    Records are read as one header line plus three lines, i.e. sequences are
+    expected not to be wrapped over several lines. The read name is the
+    header without its leading '@', cut at the first whitespace.
+    Raises ValueError when a line expected to be a header is not one.
+    """
+
+    def get_fq_reads(allreads):
+        fqd = {}
+        # Lines collected for the record being read; None before the first
+        # header has been seen.
+        record = None
+        for line in allreads:
+            line = line.rstrip()
+            if record is not None and len(record) < 3:
+                # Still inside a record: keep the line even when it is empty
+                # (zero-length read).
+                record.append(line)
+                continue
+            if not line:
+                # Tolerate blank lines between records / at the end of file.
+                continue
+            name_fields = line[1:].split()
+            if not line.startswith('@') or not name_fields:
+                raise ValueError(f"Malformed FASTQ header in {fq}: {line!r}")
+            record = []
+            fqd[name_fields[0]] = record
+        return(fqd)
+
+    # Both kinds of file are opened in text mode so that the parser always
+    # works on str, whatever the compression.
+    if fq.endswith('.gz'):
+        with gzip.open(fq, 'rt', encoding='utf-8') as allreads:
+            fqd = get_fq_reads(allreads)
+    else:
+        with open(fq, 'r') as allreads:
+            fqd = get_fq_reads(allreads)
+
+    return(fqd)
+
+
+def sort_mapped(fq_dict, mapped_reads):
+    """
+    Tag every fastq read as mapped or unmapped
+    INPUT:
+    - fq_dict(dict) dictionary with read names as keys, seq and quality as values
+        in a list
+    - mapped_reads(list) list of mapped reads
+    OUTPUT:
+    - fqd(dict) dictionary with read names as key, unmapped/mapped (u|m),
+        seq and quality as values in a list. Unmapped reads come first,
+        followed by the mapped ones, each group in the order of fq_dict.
+        The lists of fq_dict are not modified.
+    """
+    # Build the lookup once: testing membership against the list for every
+    # read would scan the whole list each time.
+    mapped_names = set(mapped_reads)
+
+    unmapped = {}
+    mapped = {}
+    for name, lines in fq_dict.items():
+        if name in mapped_names:
+            mapped[name] = ['m'] + lines
+        else:
+            unmapped[name] = ['u'] + lines
+
+    # Append the mapped reads after the unmapped ones.
+    fqd = unmapped
+    fqd.update(mapped)
+
+    return(fqd)
+
+
+def write_fq(fq_dict, fname, mode):
+    """
+    Write to fastq file (gzip-compressed when the path ends with '.gz')
+    INPUT:
+    - fq_dict(dict) dictionary with read names as keys; values are lists
+        holding the unmapped/mapped tag (u|m) followed by seq and quality
+    - fname(string) Path to output fastq file
+    - mode(string) strip (remove read) or replace (replace read sequence) by Ns
+
+    Unmapped reads are always written unchanged. Mapped reads are left out
+    in strip mode; in replace mode they are written with their sequence
+    replaced by as many N as it has bases.
+    Raises ValueError for any other mode, before the output file is created.
+    """
+    if mode not in ('strip', 'replace'):
+        # Fail loudly instead of silently producing an empty fastq file.
+        raise ValueError(f"Unknown read removal mode: {mode!r}")
+
+    # One text-mode writer for both kinds of output; the gzip stream is
+    # pinned to UTF-8 and '\n' line endings.
+    if fname.endswith('.gz'):
+        handle = gzip.open(fname, 'wt', encoding='utf-8', newline='\n')
+    else:
+        handle = open(fname, 'w')
+
+    with handle as f:
+        for name, entry in fq_dict.items():
+            status = entry[0]
+            lines = entry[1:]
+            if status == 'm':
+                if mode == 'strip':
+                    # if mapped, do not write the read lines
+                    continue
+                # replace mode: mask the sequence, keep the remaining lines
+                lines = ['N' * len(lines[0])] + lines[1:]
+            elif status != 'u':
+                # Entries carrying no known tag are not written.
+                continue
+            f.write(f"@{name}\n")
+            f.writelines(f"{line}\n" for line in lines)
+
+
+if __name__ == "__main__":
+    # Script entry point: collect the names of the reads present in the bam
+    # file, then write the forward (and optionally reverse) fastq with those
+    # reads removed or masked according to MODE.
+    BAM, IN_FWD, IN_REV, OUT_FWD, OUT_REV, MODE, PROC = _get_args()
+
+    # A reverse input requires an explicit reverse output: no default name
+    # is derived for it, unlike for the forward output below.
+    if IN_REV and not OUT_REV:
+        print('You specified an input reverse fastq, but no output reverse fastq',
+              file=sys.stderr)
+        sys.exit(1)
+
+    if OUT_FWD is None:
+        # Default name: input file name up to its first dot, plus ".r1.fq.gz"
+        out_fwd = f"{IN_FWD.split('/')[-1].split('.')[0]}.r1.fq.gz"
+    else:
+        out_fwd = OUT_FWD
+
+    mapped_reads = extract_mapped(BAM, PROC)
+    fwd_dict = parse_fq(IN_FWD)
+    fwd_reads = sort_mapped(fwd_dict, mapped_reads)
+    write_fq(fwd_reads, out_fwd, MODE)
+    if IN_REV:
+        # OUT_REV is guaranteed to be set here by the check above.
+        rev_dict = parse_fq(IN_REV)
+        rev_reads = sort_mapped(rev_dict, mapped_reads)
+        write_fq(rev_reads, OUT_REV, MODE)
diff --git a/conf/base.config b/conf/base.config
@@ -64,6 +64,11 @@ process {
   withName: damageprofiler {
     errorStrategy = 'ignore'
   }
+
+  withName: extract_unmapped_reads {
+    cpus = { check_max(8 * task.attempt, 'cpus') }
+    memory = { check_max( 8.GB * task.attempt, 'memory' ) }
+  }
 }
 
 params {
diff --git a/docs/usage.md b/docs/usage.md
@@ -501,6 +501,18 @@ Default set to `1` and clipps off one base of the left or right side of reads. N
 
 By default, nf-core/eager uses hard clipping and sets clipped bases to `N` with quality `!` in the BAM output. Turn this on to use soft-clipping instead, masking reads at the read ends respectively using the CIGAR string.
 
+## Mapped reads Stripping
+
+These parameters are used for removing mapped reads from the original FASTQ files, usually in the context of uploading the original FASTQ files to a read archive (SRA/ENA).
+
+### `--strip_input_fastq`
+
+Create pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)
+
+### `--strip_mode`
+
+Read removal mode: remove mapped reads completely (`strip`), or keep them but replace their sequence with `N`s (`replace`).
+
 ## Library-Type Parameters
 
 These parameters are required in some cases, e.g. when performing in-solution SNP capture protocols (390K,1240K, ...) for population genetics for example. Make sure to specify the required parameters in such cases. 
diff --git a/environment.yml b/environment.yml
@@ -28,4 +28,6 @@ dependencies:
   - bioconda::fastp=0.19.7
   - bioconda::bamutil=1.0.14
   - bioconda::mtnucratio=0.5
+  - pysam=0.15.2
+  - python=3.6
   #Missing Schmutzi,snpAD
diff --git a/main.nf b/main.nf

Original file line number	Diff line number	Diff line change
`@@ -64,6 +64,11 @@ process {`
`64`	`64`	`withName: damageprofiler {`
`65`	`65`	`errorStrategy = 'ignore'`
`66`	`66`	`}`
	`67`	`+`
	`68`	`+ withName: extract_unmapped_reads {`
	`69`	`+ cpus = { check_max(8 * task.attempt, 'cpus') }`
	`70`	`+ memory = { check_max( 8.GB * task.attempt, 'memory' ) }`
	`71`	`+ }`
`67`	`72`	`}`
`68`	`73`
`69`	`74`	`params {`