Skip to content

Commit 9a19801

Browse files
authored
Merge pull request #531 from nf-core/olgabot-patch-1
Rename "Stripping" section to "Host Removal"
2 parents 588e470 + da4a0a1 commit 9a19801

7 files changed

Lines changed: 68 additions & 74 deletions

File tree

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,9 @@ jobs:
103103
- name: MAPPER_BT2 Test running with BowTie2
104104
run: |
105105
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --mapper 'bowtie2' --bt2_alignmode 'local' --bt2_sensitivity 'sensitive' --bt2n 1 --bt2l 16 --bt2_trim5 1 --bt2_trim3 1
106-
- name: STRIP_FASTQ Run the basic pipeline with output unmapped reads as fastq
106+
- name: HOST_REMOVAL_FASTQ Run the basic pipeline with output unmapped reads as fastq
107107
run: |
108-
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --strip_input_fastq
108+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --hostremoval_input_fastq
109109
- name: BAM_FILTERING Run basic mapping pipeline with mapping quality filtering, and unmapped export
110110
run: |
111111
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_mapping_quality_threshold 37 --bam_unmapped_type 'fastq'

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
5656
* [#516](https://github.com/nf-core/eager/issues/516) - Made bedtools not report out of memory exit code when warning of inconsistent FASTA/Bed entry names
5757
* [#504](https://github.com/nf-core/eager/issues/504) - Removed uninformative sexdeterrmine-snps plot from MultiQC report.
5858
* Nuclear contamination is now reported with the correct library names.
59+
* [#531](https://github.com/nf-core/eager/pull/531) - Renamed 'FASTQ stripping' to 'host removal'
5960
* Merged all tutorials and FAQs into `usage.md` for display on nf-co.re
6061
* Corrected header of nuclear contamination table (`nuclear_contamination.txt`).
6162
* Fixed a bug with `nSNPs` definition in `print_x_contamination.py`. Number of SNPs now correctly reported.

bin/extract_map_reads.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ def _get_args():
3636
parser.add_argument(
3737
'-m',
3838
dest='mode',
39-
default='strip',
40-
help='Read removal mode: remove reads (strip) or replace sequence by N (replace)'
39+
default='remove',
40+
help='Read removal mode: remove reads (remove) or replace sequence by N (replace)'
4141
)
4242
parser.add_argument(
4343
'-p',
@@ -179,27 +179,27 @@ def difference(list1, list2):
179179
return(fqd)
180180

181181

182-
def write_fq(fq_dict, fname, write_mode, strip_mode, proc):
182+
def write_fq(fq_dict, fname, write_mode, remove_mode, proc):
183183
"""Write to fastq file
184184
Args:
185185
fq_dict(dict): fq_dict with unmapped read names as keys,
186186
unmapped/mapped (u|m), seq, and quality as values in a list
187187
fname(string) Path to output fastq file
188188
write_mode (str): 'wb' or 'w'
189-
strip_mode (str): strip (remove read) or replace (replace read sequence) by Ns
189+
remove_mode (str): remove (remove read) or replace (replace read sequence) by Ns
190190
proc(int) number of processes
191191
"""
192192
fq_dict_keys = list(fq_dict.keys())
193193
if write_mode == 'wb':
194194
with xopen(fname, mode='wb', threads=proc) as fw:
195195
for fq_dict_key in fq_dict_keys:
196196
wstring = ""
197-
if strip_mode == 'strip':
197+
if remove_mode == 'remove':
198198
if fq_dict[fq_dict_key][0] == 'u':
199199
wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n"
200200
for i in fq_dict[fq_dict_key][2:]:
201201
wstring += f"{i}\n"
202-
elif strip_mode == 'replace':
202+
elif remove_mode == 'replace':
203203
# if unmapped, write all the read lines
204204
if fq_dict[fq_dict_key][0] == 'u':
205205
wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n"
@@ -217,12 +217,12 @@ def write_fq(fq_dict, fname, write_mode, strip_mode, proc):
217217
with open(fname, 'w') as fw:
218218
for fq_dict_key in fq_dict_keys:
219219
wstring = ""
220-
if strip_mode == 'strip':
220+
if remove_mode == 'remove':
221221
if fq_dict[fq_dict_key][0] == 'u':
222222
wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n"
223223
for i in fq_dict[fq_dict_key][2:]:
224224
wstring += f"{i}\n"
225-
elif strip_mode == 'replace':
225+
elif remove_mode == 'replace':
226226
# if unmapped, write all the read lines
227227
if fq_dict[fq_dict_key][0] == 'u':
228228
wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n"
@@ -238,8 +238,8 @@ def write_fq(fq_dict, fname, write_mode, strip_mode, proc):
238238
fw.write(wstring)
239239

240240

241-
def check_strip_mode(mode):
242-
if mode.lower() not in ['replace', 'strip']:
241+
def check_remove_mode(mode):
242+
if mode.lower() not in ['replace', 'remove']:
243243
print(f"Mode must be {' or '.join(mode)}")
244244
return(mode.lower())
245245

@@ -257,7 +257,7 @@ def check_strip_mode(mode):
257257
else:
258258
write_mode = "w"
259259

260-
strip_mode = check_strip_mode(MODE)
260+
remove_mode = check_remove_mode(MODE)
261261
BAMFILE = pysam.AlignmentFile(BAM, 'r')
262262

263263
# FORWARD OR SE FILE
@@ -270,7 +270,7 @@ def check_strip_mode(mode):
270270
# print(fq_dict_fwd)
271271
print(f"- Writing forward fq to {out_fwd}")
272272
write_fq(fq_dict=fq_dict_fwd, fname=out_fwd,
273-
write_mode=write_mode, strip_mode=strip_mode, proc=PROC)
273+
write_mode=write_mode, remove_mode=remove_mode, proc=PROC)
274274

275275
# REVERSE FILE
276276
if IN_REV:
@@ -284,4 +284,4 @@ def check_strip_mode(mode):
284284
fq_dict_rev = get_mapped_reads(fqd_rev, mapped_reads)
285285
print(f"- Writing reverse fq to {out_rev}")
286286
write_fq(fq_dict=fq_dict_rev, fname=out_rev,
287-
write_mode=write_mode, strip_mode=strip_mode, proc=PROC)
287+
write_mode=write_mode, remove_mode=remove_mode, proc=PROC)

docs/usage.md

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@
6262
- [`--skip_deduplication`](#--skip_deduplication)
6363
- [`--skip_damage_calculation`](#--skip_damage_calculation)
6464
- [`--skip_qualimap`](#--skip_qualimap)
65+
- [BAM Conversion Options](#bam-conversion-options)
66+
- [`--run_convertinputbam`](#--run_convertinputbam)
6567
- [Complexity Filtering Options](#complexity-filtering-options)
6668
- [`--complexity_filter_poly_g`](#--complexity_filter_poly_g)
6769
- [`--complexity_filter_poly_g_min`](#--complexity_filter_poly_g_min)
@@ -106,14 +108,7 @@
106108
- [Library Complexity Estimation Parameters](#library-complexity-estimation-parameters)
107109
- [`--preseq_step_size`](#--preseq_step_size)
108110
- [DNA Damage Assessment Parameters](#dna-damage-assessment-parameters)
109-
- [`--damageprofiler_length`](#--damageprofiler_length)
110-
- [`--damageprofiler_threshold`](#--damageprofiler_threshold)
111-
- [`--damageprofiler_yaxis`](#--damageprofiler_yaxis)
112-
- [`--run_pmdtools`](#--run_pmdtools)
113-
- [`--pmdtools_range`](#--pmdtools_range)
114-
- [`--pmdtools_threshold`](#--pmdtools_threshold)
115-
- [`--pmdtools_reference_mask`](#--pmdtools_reference_mask)
116-
- [`--pmdtools_max_reads`](#--pmdtools_max_reads)
111+
- [`--udg_type`](#--udg_type)
117112
- [Feature Annotation Statistics](#feature-annotation-statistics)
118113
- [`--run_bedtools_coverage`](#--run_bedtools_coverage)
119114
- [`--anno_file`](#--anno_file)
@@ -1305,7 +1300,7 @@ when left-over sequencing artefacts of in-line barcodes present Default: 0
13051300
Number of bases to trim off the 3' (right) end of read prior to alignment. May be useful
13061301
when left-over sequencing artefacts of in-line barcodes are present. Default: 0.
13071302

1308-
### Mapped Reads Stripping
1303+
### Removal of Host-Mapped Reads
13091304

13101305
These parameters are used for removing mapped reads from the original input
13111306
FASTQ files, usually in the context of uploading the original FASTQ files to a
@@ -1320,17 +1315,16 @@ your data to apply their own adapter removal/read merging procedures, while
13201315
maintaining anonymity for sample donors - for example with microbiome
13211316
research.
13221317

1323-
If using TSV input, stripping is performed library, i.e. after lane merging.
1318+
If using TSV input, mapped read removal is performed per library, i.e. after lane merging.
13241319

1325-
#### `--strip_input_fastq`
1320+
#### `--hostremoval_input_fastq`
13261321

13271322
Create pre-Adapter Removal FASTQ files without reads that mapped to reference
13281323
(e.g. for public upload of privacy sensitive non-host data)
13291324

1330-
#### `--strip_mode`
1325+
#### `--hostremoval_mode`
13311326

1332-
Read removal mode. Strip mapped reads completely (`'strip'`) or just replace
1333-
mapped reads sequence by N (`'replace'`)
1327+
Read removal mode. Completely remove mapped reads from the file(s) (`'remove'`) or just replace mapped reads sequence by N (`'replace'`)
13341328

13351329
### Read Filtering and Conversion Parameters
13361330

main.nf

Lines changed: 31 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,9 @@ def helpMessage() {
9595
--bt2_trim5 [num] Specify number of bases to trim off from 5' (left) end of read before alignment. Default: ${params.bt2_trim5}
9696
--bt2_trim3 [num] Specify number of bases to trim off from 3' (right) end of read before alignment. Default: ${params.bt2_trim3}
9797
98-
Stripping
99-
--strip_input_fastq [bool] Turn on creation of per-library pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data).
100-
--strip_mode [str] Stripping mode. Remove mapped reads completely from FASTQ (strip) or just mask mapped reads sequence by N (replace). Default: '${params.strip_mode}'
98+
Host removal
99+
--hostremoval_input_fastq [bool] Turn on creating pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)
100+
--hostremoval_mode [str] Host DNA Removal mode. Remove mapped reads completely from FASTQ (remove) or just mask mapped reads sequence by N (replace). Default: '${params.hostremoval_mode}'
101101
102102
BAM Filtering
103103
--run_bam_filtering [bool] Turn on filtering of mapping quality, read lengths, or unmapped reads of BAM files.
@@ -360,10 +360,10 @@ if (!has_extension(params.input, "tsv") && params.skip_collapse && params.singl
360360
exit 1, "[nf-core/eager] error: --skip_collapse can only be set for paired_end samples."
361361
}
362362

363-
// Strip mode validation
364-
if (params.strip_input_fastq){
365-
if (!(['strip','replace'].contains(params.strip_mode))) {
366-
exit 1, "[nf-core/eager] error: --strip_mode can only be set to strip or replace."
363+
// Host removal mode validation
364+
if (params.hostremoval_input_fastq){
365+
if (!(['remove','replace'].contains(params.hostremoval_mode))) {
366+
exit 1, "[nf-core/eager] error: --hostremoval_mode can only be set to remove or replace."
367367
}
368368
}
369369

@@ -657,9 +657,9 @@ ch_input_for_convertbam = Channel.empty()
657657
ch_bam_channel
658658
.into { ch_input_for_convertbam; ch_input_for_indexbam; }
659659

660-
// Also need to send raw files for lane merging, if we want to strip fastq
660+
// Also need to send raw files for lane merging, if we want to produce host-removed fastq
661661
ch_fastq_channel
662-
.into { ch_input_for_skipconvertbam; ch_input_for_lanemerge_stripfastq }
662+
.into { ch_input_for_skipconvertbam; ch_input_for_lanemerge_hostremovalfastq }
663663

664664
///////////////////////////////////////////////////
665665
/* -- HEADER LOG INFO -- */
@@ -686,9 +686,9 @@ summary['Running BAM filtering'] = params.run_bam_filtering ? 'Yes' : 'No'
686686
if (params.run_bam_filtering) {
687687
summary['Skip Read Merging'] = params.bam_unmapped_type
688688
}
689-
summary['Run Fastq Stripping'] = params.strip_input_fastq ? 'Yes' : 'No'
690-
if (params.strip_input_fastq){
691-
summary['Strip mode'] = params.strip_mode
689+
summary['Run Fastq Host Removal'] = params.hostremoval_input_fastq ? 'Yes' : 'No'
690+
if (params.hostremoval_input_fastq){
691+
summary['Host removal mode'] = params.hostremoval_mode
692692
}
693693
summary['Skipping Preseq?'] = params.skip_preseq ? 'Yes' : 'No'
694694
summary['Skipping Deduplication?'] = params.skip_deduplication ? 'Yes' : 'No'
@@ -1333,21 +1333,21 @@ if ( ( params.skip_collapse || params.skip_adapterremoval ) ) {
13331333
.into { ch_lanemerge_for_skipmap; ch_lanemerge_for_bwa; ch_lanemerge_for_cm; ch_lanemerge_for_bwamem; ch_lanemerge_for_bt2 }
13341334
}
13351335

1336-
// ENA upload doesn't do separate lanes, so merge raw FASTQs for mapped-reads stripping
1336+
// ENA upload doesn't do separate lanes, so merge raw FASTQs for mapped-reads removal
13371337

13381338
// Per-library lane grouping done within process
1339-
process lanemerge_stripfastq {
1339+
process lanemerge_hostremoval_fastq {
13401340
label 'sc_tiny'
13411341
tag "${libraryid}"
13421342

13431343
when:
1344-
params.strip_input_fastq
1344+
params.hostremoval_input_fastq
13451345

13461346
input:
1347-
tuple samplename, libraryid, lane, colour, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_input_for_lanemerge_stripfastq.groupTuple(by: [0,1,3,4,5,6,7])
1347+
tuple samplename, libraryid, lane, colour, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_input_for_lanemerge_hostremovalfastq.groupTuple(by: [0,1,3,4,5,6,7])
13481348

13491349
output:
1350-
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.fq.gz") into ch_fastqlanemerge_for_stripfastq
1350+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.fq.gz") into ch_fastqlanemerge_for_hostremovalfastq
13511351

13521352
script:
13531353
if ( seqtype == 'PE' ){
@@ -1623,10 +1623,10 @@ process bowtie2 {
16231623

16241624
// Gather all mapped BAMs from all possible mappers into common channels to send downstream
16251625
ch_output_from_bwa.mix(ch_output_from_bwamem, ch_output_from_cm, ch_indexbam_for_filtering, ch_output_from_bt2)
1626-
.into { ch_mapping_for_stripfastq; ch_mapping_for_seqtype_merging }
1626+
.into { ch_mapping_for_hostremovalfastq; ch_mapping_for_seqtype_merging }
16271627

16281628
// Synchronise the mapped input FASTQ and input non-remapped BAM channels
1629-
ch_fastqlanemerge_for_stripfastq
1629+
ch_fastqlanemerge_for_hostremovalfastq
16301630
.map {
16311631
def samplename = it[0]
16321632
def libraryid = it[1]
@@ -1641,7 +1641,7 @@ ch_fastqlanemerge_for_stripfastq
16411641
[ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
16421642

16431643
}
1644-
.mix(ch_mapping_for_stripfastq)
1644+
.mix(ch_mapping_for_hostremovalfastq)
16451645
.groupTuple(by: [0,1,3,4,5,6])
16461646
.map {
16471647
def samplename = it[0]
@@ -1660,38 +1660,37 @@ ch_fastqlanemerge_for_stripfastq
16601660

16611661
}
16621662
.filter{ it[8] != null }
1663-
.dump(tag: "StripFastq Input")
1664-
.set { ch_synced_for_stripfastq }
1663+
.set { ch_synced_for_hostremovalfastq }
16651664

16661665
// Remove mapped reads from original (lane merged) input FASTQ e.g. for sensitive host data when running metagenomic data
16671666

1668-
process strip_input_fastq {
1667+
process hostremoval_input_fastq {
16691668
label 'mc_medium'
16701669
tag "${libraryid}"
1671-
publishDir "${params.outdir}/stripped_fastq", mode: params.publish_dir_mode
1670+
publishDir "${params.outdir}/hostremoved_fastq", mode: params.publish_dir_mode
16721671

16731672
when:
1674-
params.strip_input_fastq
1673+
params.hostremoval_input_fastq
16751674

16761675
input:
1677-
tuple samplename, libraryid, seqtype, organism, strandedness, udg, path(r1), path(r2), file(bam), file(bai) from ch_synced_for_stripfastq
1676+
tuple samplename, libraryid, seqtype, organism, strandedness, udg, file(r1), file(r2), file(bam), file(bai) from ch_synced_for_hostremovalfastq
16781677

16791678
output:
1680-
tuple samplename, libraryid, seqtype, organism, strandedness, udg, file("*.fq.gz") into ch_output_from_stripfastq
1679+
tuple samplename, libraryid, seqtype, organism, strandedness, udg, file("*.fq.gz") into ch_output_from_hostremovalfastq
16811680

16821681
script:
16831682
if ( seqtype == 'SE' ) {
1684-
out_fwd = bam.baseName+'.stripped.fq.gz'
1683+
out_fwd = bam.baseName+'.hostremoved.fq.gz'
16851684
"""
16861685
samtools index $bam
1687-
extract_map_reads.py $bam ${r1} -m ${params.strip_mode} -of $out_fwd -p ${task.cpus}
1686+
extract_map_reads.py $bam ${r1} -m ${params.hostremoval_mode} -of $out_fwd -p ${task.cpus}
16881687
"""
16891688
} else {
1690-
out_fwd = bam.baseName+'.stripped.fwd.fq.gz'
1691-
out_rev = bam.baseName+'.stripped.rev.fq.gz'
1689+
out_fwd = bam.baseName+'.hostremoved.fwd.fq.gz'
1690+
out_rev = bam.baseName+'.hostremoved.rev.fq.gz'
16921691
"""
16931692
samtools index $bam
1694-
extract_map_reads.py $bam ${r1} -rev ${r2} -m ${params.strip_mode} -of $out_fwd -or $out_rev -p ${task.cpus}
1693+
extract_map_reads.py $bam ${r1} -rev ${r2} -m ${params.hostremoval_mode} -of $out_fwd -or $out_rev -p ${task.cpus}
16951694
"""
16961695
}
16971696

nextflow.config

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,9 @@ params {
7777
bt2_trim5 = 0
7878
bt2_trim3 = 0
7979

80-
//Mapped read stripping from input FASTQ
81-
strip_input_fastq = false
82-
strip_mode = 'strip'
80+
//Mapped read removal from input FASTQ
81+
hostremoval_input_fastq = false
82+
hostremoval_mode = 'remove'
8383

8484
//BAM Filtering steps (default = discard unmapped reads)
8585
run_bam_filtering = false
@@ -117,7 +117,7 @@ params {
117117
bamutils_clip_half_udg_right = 1
118118
bamutils_clip_none_udg_left = 1
119119
bamutils_clip_none_udg_right = 1
120-
bamutils_softclip = false
120+
bamutils_softclip = false
121121

122122
//Genotyping options
123123
run_genotyping = false

nextflow_schema.json

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -570,24 +570,24 @@
570570
},
571571
"fa_icon": "fas fa-layer-group"
572572
},
573-
"stripping": {
574-
"title": "Stripping",
573+
"host_removal": {
574+
"title": "Removal of Host-Mapped Reads",
575575
"type": "object",
576576
"description": "Options for production of host-read removed FASTQ files for privacy reasons.",
577577
"default": "",
578578
"properties": {
579-
"strip_input_fastq": {
579+
"hostremoval_input_fastq": {
580580
"type": "boolean",
581581
"description": "Turn on per-library creation of pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)",
582582
"fa_icon": "fas fa-power-off",
583583
"help_text": "Create pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)\n"
584584
},
585-
"strip_mode": {
585+
"hostremoval_mode": {
586586
"type": "string",
587-
"default": "strip",
588-
"description": "Stripping mode. Remove mapped reads completely from FASTQ (strip) or just mask mapped reads sequence by N (replace).",
587+
"default": "remove",
588+
"description": "Host removal mode. Remove mapped reads completely from FASTQ (remove) or just mask mapped reads sequence by N (replace).",
589589
"fa_icon": "fas fa-mask",
590-
"help_text": "Read removal mode. Strip mapped reads completely (`'strip'`) or just replace mapped reads sequence by N (`'replace'`)\n"
590+
"help_text": "Read removal mode. Remove mapped reads completely (`'remove'`) or just replace mapped reads sequence by N (`'replace'`)\n"
591591
}
592592
},
593593
"fa_icon": "fas fa-user-shield"
@@ -1380,7 +1380,7 @@
13801380
"$ref": "#/definitions/mapping"
13811381
},
13821382
{
1383-
"$ref": "#/definitions/stripping"
1383+
"$ref": "#/definitions/host_removal"
13841384
},
13851385
{
13861386
"$ref": "#/definitions/bam_filtering"
@@ -1425,4 +1425,4 @@
14251425
"$ref": "#/definitions/metagenomic_authentication"
14261426
}
14271427
]
1428-
}
1428+
}

0 commit comments

Comments
 (0)