Skip to content

Commit 4d2f062

Browse files
authored
Merge pull request #189 from maxibor/dev
Fastq from unmapped reads
2 parents b0e84f8 + 8f17924 commit 4d2f062

7 files changed

Lines changed: 336 additions & 7 deletions

File tree

.travis.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ script:
5050
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_trim --saveReference
5151
# Run the basic pipeline with paired end data without adapterRemoval
5252
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_adapterremoval --saveReference
53+
# Run the basic pipeline with output unmapped reads as fastq
54+
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --strip_input_fastq
5355
# Run the same pipeline testing optional step: fastp, complexity
5456
- nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --complexity_filter --bwa_index results/reference_genome/bwa_index/bwa_index/
5557
# Test BAM Trimming

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
88

99
### `Added`
1010

11+
* [#189](https://github.com/nf-core/eager/pull/189) - Outputing unmapped reads in a fastq files with the --strip_input_fastq flag
1112
* [#186](https://github.com/nf-core/eager/pull/186) - Make FastQC skipping [possible]
1213
/(https://github.com/nf-core/eager/issues/182)
1314
* Merged in [nf-core/tools](https://github.com/nf-core/tools) release V1.6 template changes

bin/extract_map_reads.py

Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import multiprocessing
5+
import pysam
6+
from functools import partial
7+
import gzip
8+
import sys
9+
10+
11+
def _get_args():
12+
'''This function parses and return arguments passed in'''
13+
parser = argparse.ArgumentParser(
14+
prog='extract_mapped_reads',
15+
formatter_class=argparse.RawDescriptionHelpFormatter,
16+
description=f'''
17+
Remove mapped in bam file from fastq files
18+
''')
19+
parser.add_argument('bam_file', help="path to bam file")
20+
parser.add_argument('fwd', help='path to forward fastq file')
21+
parser.add_argument(
22+
'-2',
23+
dest="rev",
24+
default=None,
25+
help="path to forward fastq file")
26+
parser.add_argument(
27+
'-of',
28+
dest="out_fwd",
29+
default=None,
30+
help="path to forward output fastq file")
31+
parser.add_argument(
32+
'-or',
33+
dest="out_rev",
34+
default=None,
35+
help="path to forward output fastq file")
36+
parser.add_argument(
37+
'-m',
38+
dest='mode',
39+
default='strip',
40+
help='Read removal mode: remove reads (strip) or replace sequence by N (replace)'
41+
)
42+
parser.add_argument(
43+
'-p',
44+
dest='process',
45+
default=4,
46+
help='Number of parallel processes'
47+
)
48+
49+
args = parser.parse_args()
50+
51+
bam = args.bam_file
52+
in_fwd = args.fwd
53+
in_rev = args.rev
54+
out_fwd = args.out_fwd
55+
out_rev = args.out_rev
56+
mode = args.mode
57+
proc = int(args.process)
58+
59+
return(bam, in_fwd, in_rev, out_fwd, out_rev, mode, proc)
60+
61+
62+
def extract_mapped_chr(chr, bam):
63+
"""
64+
Get mapped reads per chromosome
65+
INPUT:
66+
- chr(str): chromosome
67+
- bam(str): bamfile path
68+
OUTPUT:
69+
- res(list): list of mapped reads (str) name per chromosome
70+
"""
71+
res = []
72+
bamfile = pysam.AlignmentFile(bam, "rb")
73+
reads = bamfile.fetch(chr, multiple_iterators=True)
74+
for read in reads:
75+
res.append(read.query_name)
76+
return(res)
77+
78+
79+
def extract_mapped(bam, processes):
80+
"""
81+
Get mapped reads in parallel
82+
INPUT:
83+
- bam(str): bamfile path
84+
OUTPUT:
85+
- result(list) list of mapped reads name (str)
86+
"""
87+
try:
88+
bamfile = pysam.AlignmentFile(bam, "rb")
89+
chrs = bamfile.references
90+
except ValueError as e:
91+
print(e)
92+
extract_mapped_chr_partial = partial(extract_mapped_chr, bam=bam)
93+
p = multiprocessing.Pool(processes)
94+
res = p.map(extract_mapped_chr_partial, chrs)
95+
p.close()
96+
p.join()
97+
result = [i for ares in res for i in ares]
98+
return(result)
99+
100+
101+
def parse_fq(fq):
102+
"""
103+
Parse a FASTQ file
104+
INPUT:
105+
- fq(str): path to fastq file
106+
OUTPUT:
107+
- fqd(dict): dictionary with read names as keys, seq and quality as values
108+
in a list
109+
"""
110+
111+
def get_fq_reads(allreads):
112+
fqd = {}
113+
myflag = True
114+
for line in allreads:
115+
line = line.decode('utf-8').rstrip()
116+
if myflag == True:
117+
instrument = line.split()[0].split(":")[0]
118+
myflag = False
119+
if line.startswith(instrument):
120+
seqname = line[1:].split()[0]
121+
fqd[seqname] = []
122+
continue
123+
else:
124+
fqd[seqname].append(line)
125+
return(fqd)
126+
127+
if fq.endswith('.gz'):
128+
with gzip.open(fq, 'rb') as allreads:
129+
fqd = get_fq_reads(allreads)
130+
else:
131+
with open(fq, 'r') as allreads:
132+
fqd = get_fq_reads(allreads)
133+
134+
return(fqd)
135+
136+
137+
def sort_mapped(fq_dict, mapped_reads):
138+
"""
139+
Sort mapped reads from dictionary of fastq reads
140+
INPUT:
141+
- fq_dict(dict) dictionary with read names as keys, seq and quality as values
142+
in a list
143+
- mapped_reads(list) list of mapped reads
144+
OUTPUT:
145+
- mfqd(dict) dictionary with mapped read names as keys, seq and quality as values
146+
in a list
147+
- fqd(dict) dictionary with unmapped read names as key, unmapped/mapped (u|m),
148+
seq and quality as values in a list
149+
"""
150+
fqd = {}
151+
unmapped = [i for i in list(fq_dict.keys()) if i not in mapped_reads]
152+
mapped = [i for i in list(fq_dict.keys()) if i in mapped_reads]
153+
# print(unmap)
154+
for r in unmapped:
155+
fqd[r] = ['u']+fq_dict[r]
156+
for r in mapped:
157+
fqd[r] = ['m']+fq_dict[r]
158+
159+
return(fqd)
160+
161+
162+
def write_fq(fq_dict, fname, mode):
163+
"""
164+
Write to fastq file
165+
INPUT:
166+
- fq_dict(dict) dictionary with unmapped read names as keys, seq and quality as values
167+
in a list
168+
- fname(string) Path to output fastq file
169+
- mode(string) strip (remove read) or replace (replace read sequence) by Ns
170+
"""
171+
172+
if fname.endswith('.gz'):
173+
with gzip.open(fname, 'wb') as f:
174+
for k in list(fq_dict.keys()):
175+
if mode == 'strip':
176+
# if unmapped, write all the read lines
177+
if fq_dict[k][0] == 'u':
178+
f.write(f"@{k}\n".encode())
179+
for i in fq_dict[k][1:]:
180+
f.write(f"{i}\n".encode())
181+
# if mapped, do not write the read lines
182+
elif fq_dict[k][0] == 'm':
183+
continue
184+
185+
elif mode == 'replace':
186+
# if unmapped, write all the read lines
187+
if fq_dict[k][0] == 'u':
188+
f.write(f"@{k}\n".encode())
189+
for i in fq_dict[k][1:]:
190+
f.write(f"{i}\n".encode())
191+
# if mapped, write all the read lines, but replace sequence
192+
# by N*(len(sequence))
193+
elif fq_dict[k][0] == 'm':
194+
f.write(f"@{k}\n".encode())
195+
f.write(f"{'N'*len(fq_dict[k][1])}\n".encode())
196+
for i in fq_dict[k][2:]:
197+
f.write(f"{i}\n".encode())
198+
199+
else:
200+
with open(fname, 'w') as f:
201+
for k in list(fq_dict.keys()):
202+
if mode == 'strip':
203+
if fq_dict[k][0] == 'u':
204+
f.write(f"@{k}\n")
205+
for i in fq_dict[k][1:]:
206+
f.write(f"{i}\n")
207+
elif fq_dict[k][0] == 'm':
208+
continue
209+
elif mode == 'replace':
210+
if fq_dict[k][0] == 'u':
211+
f.write(f"@{k}\n")
212+
for i in fq_dict[k][1:]:
213+
f.write(f"{i}\n")
214+
elif fq_dict[k][0] == 'm':
215+
f.write(f"@{k}\n")
216+
f.write(f"{'N'*len(fq_dict[k][1])}\n")
217+
for i in fq_dict[k][2:]:
218+
f.write(f"{i}\n")
219+
220+
221+
if __name__ == "__main__":
222+
BAM, IN_FWD, IN_REV, OUT_FWD, OUT_REV, MODE, PROC = _get_args()
223+
224+
if IN_REV and not OUT_REV:
225+
print('You specified an input reverse fastq, but no output reverse fastq')
226+
sys.exit(1)
227+
228+
if OUT_FWD == None:
229+
out_fwd = f"{IN_FWD.split('/')[-1].split('.')[0]}.r1.fq.gz"
230+
else:
231+
out_fwd = OUT_FWD
232+
233+
mapped_reads = extract_mapped(BAM, PROC)
234+
fwd_dict = parse_fq(IN_FWD)
235+
fwd_reads = sort_mapped(fwd_dict, mapped_reads)
236+
write_fq(fwd_reads, out_fwd, MODE)
237+
if IN_REV:
238+
if OUT_REV == None:
239+
out_rev = f"{IN_REV.split('/')[-1].split('.')[0]}.r2.fq.gz"
240+
else:
241+
out_rev = OUT_REV
242+
rev_dict = parse_fq(IN_REV)
243+
rev_reads = sort_mapped(rev_dict, mapped_reads)
244+
write_fq(rev_reads, out_rev, MODE)

conf/base.config

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ process {
6464
withName: damageprofiler {
6565
errorStrategy = 'ignore'
6666
}
67+
68+
withName: extract_unmapped_reads {
69+
cpus = { check_max(8 * task.attempt, 'cpus') }
70+
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
71+
}
6772
}
6873

6974
params {

docs/usage.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,18 @@ Default set to `1` and clipps off one base of the left or right side of reads. N
501501

502502
By default, nf-core/eager uses hard clipping and sets clipped bases to `N` with quality `!` in the BAM output. Turn this on to use soft-clipping instead, masking reads at the read ends respectively using the CIGAR string.
503503

504+
## Mapped reads Stripping
505+
506+
These parameters are used for removing mapped reads from orginal fastq files, usually in the context of uploading the original fastq files to a read archive (SRA/ENA)
507+
508+
### `--strip_input_fastq`
509+
510+
Create pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)
511+
512+
### `--strip_mode`
513+
514+
Read removal mode. Strip mapped reads completely (strip) or just replace mapped reads sequence by N (replace)
515+
504516
## Library-Type Parameters
505517

506518
These parameters are required in some cases, e.g. when performing in-solution SNP capture protocols (390K,1240K, ...) for population genetics for example. Make sure to specify the required parameters in such cases.

environment.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,6 @@ dependencies:
2828
- bioconda::fastp=0.19.7
2929
- bioconda::bamutil=1.0.14
3030
- bioconda::mtnucratio=0.5
31+
- pysam=0.15.2
32+
- python=3.6
3133
#Missing Schmutzi,snpAD

0 commit comments

Comments
 (0)