Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,9 @@ jobs:
- name: MAPPER_BT2 Test running with BowTie2
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --mapper 'bowtie2' --bt2_alignmode 'local' --bt2_sensitivity 'sensitive' --bt2n 1 --bt2l 16 --bt2_trim5 1 --bt2_trim3 1
- name: STRIP_FASTQ Run the basic pipeline with output unmapped reads as fastq
- name: HOST_REMOVAL_FASTQ Run the basic pipeline with output unmapped reads as fastq
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --strip_input_fastq
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --hostremoval_input_fastq
- name: BAM_FILTERING Run basic mapping pipeline with mapping quality filtering, and unmapped export
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_mapping_quality_threshold 37 --bam_unmapped_type 'fastq'
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
* [#516](https://github.com/nf-core/eager/issues/516) - Made bedtools not report out of memory exit code when warning of inconsistant FASTA/Bed entry names
* [#504](https://github.com/nf-core/eager/issues/504) - Removed uninformative sexdeterrmine-snps plot from MultiQC report.
* Nuclear contamination is now reported with the correct library names.
* [#531](https://github.com/nf-core/eager/pull/531) - Renamed 'FASTQ stripping' to 'host removal'
* Merged all tutorials and FAQs into `usage.md` for display on nf-co.re

### `Dependencies`
Expand Down
26 changes: 13 additions & 13 deletions bin/extract_map_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ def _get_args():
parser.add_argument(
'-m',
dest='mode',
default='strip',
help='Read removal mode: remove reads (strip) or replace sequence by N (replace)'
default='remove',
help='Read removal mode: remove reads (remove) or replace sequence by N (replace)'
)
parser.add_argument(
'-p',
Expand Down Expand Up @@ -179,27 +179,27 @@ def difference(list1, list2):
return(fqd)


def write_fq(fq_dict, fname, write_mode, strip_mode, proc):
def write_fq(fq_dict, fname, write_mode, remove_mode, proc):
"""Write to fastq file
Args:
fq_dict(dict): fq_dict with unmapped read names as keys,
unmapped/mapped (u|m), seq, and quality as values in a list
fname(string) Path to output fastq file
write_mode (str): 'rb' or 'r'
strip_mode (str): strip (remove read) or replace (replace read sequence) by Ns
remove_mode (str): remove (remove read) or replace (replace read sequence) by Ns
proc(int) number of processes
"""
fq_dict_keys = list(fq_dict.keys())
if write_mode == 'wb':
with xopen(fname, mode='wb', threads=proc) as fw:
for fq_dict_key in fq_dict_keys:
wstring = ""
if strip_mode == 'strip':
if remove_mode == 'remove':
if fq_dict[fq_dict_key][0] == 'u':
wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n"
for i in fq_dict[fq_dict_key][2:]:
wstring += f"{i}\n"
elif strip_mode == 'replace':
elif remove_mode == 'replace':
# if unmapped, write all the read lines
if fq_dict[fq_dict_key][0] == 'u':
wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n"
Expand All @@ -217,12 +217,12 @@ def write_fq(fq_dict, fname, write_mode, strip_mode, proc):
with open(fname, 'w') as fw:
for fq_dict_key in fq_dict_keys:
wstring = ""
if strip_mode == 'strip':
if remove_mode == 'remove':
if fq_dict[fq_dict_key][0] == 'u':
wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n"
for i in fq_dict[fq_dict_key][2:]:
wstring += f"{i}\n"
elif strip_mode == 'replace':
elif remove_mode == 'replace':
# if unmapped, write all the read lines
if fq_dict[fq_dict_key][0] == 'u':
wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n"
Expand All @@ -238,8 +238,8 @@ def write_fq(fq_dict, fname, write_mode, strip_mode, proc):
fw.write(wstring)


def check_strip_mode(mode):
if mode.lower() not in ['replace', 'strip']:
def check_remove_mode(mode):
if mode.lower() not in ['replace', 'remove']:
print(f"Mode must be {' or '.join(mode)}")
return(mode.lower())

Expand All @@ -257,7 +257,7 @@ def check_strip_mode(mode):
else:
write_mode = "w"

strip_mode = check_strip_mode(MODE)
remove_mode = check_remove_mode(MODE)
BAMFILE = pysam.AlignmentFile(BAM, 'r')

# FORWARD OR SE FILE
Expand All @@ -270,7 +270,7 @@ def check_strip_mode(mode):
# print(fq_dict_fwd)
print(f"- Writing forward fq to {out_fwd}")
write_fq(fq_dict=fq_dict_fwd, fname=out_fwd,
write_mode=write_mode, strip_mode=strip_mode, proc=PROC)
write_mode=write_mode, remove_mode=remove_mode, proc=PROC)

# REVERSE FILE
if IN_REV:
Expand All @@ -284,4 +284,4 @@ def check_strip_mode(mode):
fq_dict_rev = get_mapped_reads(fqd_rev, mapped_reads)
print(f"- Writing reverse fq to {out_rev}")
write_fq(fq_dict=fq_dict_rev, fname=out_rev,
write_mode=write_mode, strip_mode=strip_mode, proc=PROC)
write_mode=write_mode, remove_mode=remove_mode, proc=PROC)
22 changes: 8 additions & 14 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
- [`--skip_deduplication`](#--skip_deduplication)
- [`--skip_damage_calculation`](#--skip_damage_calculation)
- [`--skip_qualimap`](#--skip_qualimap)
- [BAM Conversion Options](#bam-conversion-options)
- [`--run_convertinputbam`](#--run_convertinputbam)
- [Complexity Filtering Options](#complexity-filtering-options)
- [`--complexity_filter_poly_g`](#--complexity_filter_poly_g)
- [`--complexity_filter_poly_g_min`](#--complexity_filter_poly_g_min)
Expand Down Expand Up @@ -106,14 +108,7 @@
- [Library Complexity Estimation Parameters](#library-complexity-estimation-parameters)
- [`--preseq_step_size`](#--preseq_step_size)
- [DNA Damage Assessment Parameters](#dna-damage-assessment-parameters)
- [`--damageprofiler_length`](#--damageprofiler_length)
- [`--damageprofiler_threshold`](#--damageprofiler_threshold)
- [`--damageprofiler_yaxis`](#--damageprofiler_yaxis)
- [`--run_pmdtools`](#--run_pmdtools)
- [`--pmdtools_range`](#--pmdtools_range)
- [`--pmdtools_threshold`](#--pmdtools_threshold)
- [`--pmdtools_reference_mask`](#--pmdtools_reference_mask)
- [`--pmdtools_max_reads`](#--pmdtools_max_reads)
- [`--udg_type`](#--udg_type)
- [Feature Annotation Statistics](#feature-annotation-statistics)
- [`--run_bedtools_coverage`](#--run_bedtools_coverage)
- [`--anno_file`](#--anno_file)
Expand Down Expand Up @@ -1303,7 +1298,7 @@ when left-over sequencing artefacts of in-line barcodes present Default: 0
Number of bases to trim of 3' (right) end of read prior alignment. Maybe useful
when left-over sequencing artefacts of in-line barcodes present Default: 0.

### Mapped Reads Stripping
### Removal of Host-Mapped Reads

These parameters are used for removing mapped reads from the original input
FASTQ files, usually in the context of uploading the original FASTQ files to a
Expand All @@ -1318,17 +1313,16 @@ your data to apply their own adapter removal/read merging procedures, while
maintaining anonyminity for sample donors - for example with microbiome
research.

If using TSV input, stripping is performed library, i.e. after lane merging.
If using TSV input, mapped read removal is performed per library, i.e. after lane merging.

#### `--strip_input_fastq`
#### `--hostremoval_input_fastq`

Create pre-Adapter Removal FASTQ files without reads that mapped to reference
(e.g. for public upload of privacy sensitive non-host data)

#### `--strip_mode`
#### `--hostremoval_mode`

Read removal mode. Strip mapped reads completely (`'strip'`) or just replace
mapped reads sequence by N (`'replace'`)
Read removal mode. Completely remove mapped reads from the file(s) (`'remove'`) or just replace mapped reads sequence by N (`'replace'`)

### Read Filtering and Conversion Parameters

Expand Down
63 changes: 31 additions & 32 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,9 @@ def helpMessage() {
--bt2_trim5 [num] Specify number of bases to trim off from 5' (left) end of read before alignment. Default: ${params.bt2_trim5}
--bt2_trim3 [num] Specify number of bases to trim off from 3' (right) end of read before alignment. Default: ${params.bt2_trim3}

Stripping
--strip_input_fastq [bool] Turn on creation of per-library pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data).
--strip_mode [str] Stripping mode. Remove mapped reads completely from FASTQ (strip) or just mask mapped reads sequence by N (replace). Default: '${params.strip_mode}'
Host removal
--hostremoval_input_fastq [bool] Turn on creating pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)
--hostremoval_mode [str] Host DNA Removal mode. Remove mapped reads completely from FASTQ (remove) or just mask mapped reads sequence by N (replace). Default: '${params.hostremoval_mode}'

BAM Filtering
--run_bam_filtering [bool] Turn on filtering of mapping quality, read lengths, or unmapped reads of BAM files.
Expand Down Expand Up @@ -359,10 +359,10 @@ if (!has_extension(params.input, "tsv") && params.skip_collapse && params.singl
exit 1, "[nf-core/eager] error: --skip_collapse can only be set for paired_end samples."
}

// Strip mode validation
if (params.strip_input_fastq){
if (!(['strip','replace'].contains(params.strip_mode))) {
exit 1, "[nf-core/eager] error: --strip_mode can only be set to strip or replace."
// Host removal mode validation
if (params.hostremoval_input_fastq){
if (!(['remove','replace'].contains(params.hostremoval_mode))) {
exit 1, "[nf-core/eager] error: --hostremoval_mode can only be set to remove or replace."
}
}

Expand Down Expand Up @@ -654,9 +654,9 @@ ch_input_for_convertbam = Channel.empty()
ch_bam_channel
.into { ch_input_for_convertbam; ch_input_for_indexbam; }

// Also need to send raw files for lane merging, if we want to strip fastq
// Also need to send raw files for lane merging, if we want to host removed fastq
ch_fastq_channel
.into { ch_input_for_skipconvertbam; ch_input_for_lanemerge_stripfastq }
.into { ch_input_for_skipconvertbam; ch_input_for_lanemerge_hostremovalfastq }

///////////////////////////////////////////////////
/* -- HEADER LOG INFO -- */
Expand All @@ -683,9 +683,9 @@ summary['Running BAM filtering'] = params.run_bam_filtering ? 'Yes' : 'No'
if (params.run_bam_filtering) {
summary['Skip Read Merging'] = params.bam_unmapped_type
}
summary['Run Fastq Stripping'] = params.strip_input_fastq ? 'Yes' : 'No'
if (params.strip_input_fastq){
summary['Strip mode'] = params.strip_mode
summary['Run Fastq Host Removal'] = params.hostremoval_input_fastq ? 'Yes' : 'No'
if (params.hostremoval_input_fastq){
summary['Host removal mode'] = params.hostremoval_mode
}
summary['Skipping Preseq?'] = params.skip_preseq ? 'Yes' : 'No'
summary['Skipping Deduplication?'] = params.skip_deduplication ? 'Yes' : 'No'
Expand Down Expand Up @@ -1330,21 +1330,21 @@ if ( ( params.skip_collapse || params.skip_adapterremoval ) ) {
.into { ch_lanemerge_for_skipmap; ch_lanemerge_for_bwa; ch_lanemerge_for_cm; ch_lanemerge_for_bwamem; ch_lanemerge_for_bt2 }
}

// ENA upload doesn't do separate lanes, so merge raw FASTQs for mapped-reads stripping
// ENA upload doesn't do separate lanes, so merge raw FASTQs for mapped-reads removal

// Per-library lane grouping done within process
process lanemerge_stripfastq {
process lanemerge_hostremoval_fastq {
label 'sc_tiny'
tag "${libraryid}"

when:
params.strip_input_fastq
params.hostremoval_input_fastq

input:
tuple samplename, libraryid, lane, colour, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_input_for_lanemerge_stripfastq.groupTuple(by: [0,1,3,4,5,6,7])
tuple samplename, libraryid, lane, colour, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_input_for_lanemerge_hostremovalfastq.groupTuple(by: [0,1,3,4,5,6,7])

output:
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.fq.gz") into ch_fastqlanemerge_for_stripfastq
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.fq.gz") into ch_fastqlanemerge_for_hostremovalfastq

script:
if ( seqtype == 'PE' ){
Expand Down Expand Up @@ -1620,10 +1620,10 @@ process bowtie2 {

// Gather all mapped BAMs from all possible mappers into common channels to send downstream
ch_output_from_bwa.mix(ch_output_from_bwamem, ch_output_from_cm, ch_indexbam_for_filtering, ch_output_from_bt2)
.into { ch_mapping_for_stripfastq; ch_mapping_for_seqtype_merging }
.into { ch_mapping_for_hostremovalfastq; ch_mapping_for_seqtype_merging }

// Synchronise the mapped input FASTQ and input non-remapped BAM channels
ch_fastqlanemerge_for_stripfastq
ch_fastqlanemerge_for_hostremovalfastq
.map {
def samplename = it[0]
def libraryid = it[1]
Expand All @@ -1638,7 +1638,7 @@ ch_fastqlanemerge_for_stripfastq
[ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]

}
.mix(ch_mapping_for_stripfastq)
.mix(ch_mapping_for_hostremovalfastq)
.groupTuple(by: [0,1,3,4,5,6])
.map {
def samplename = it[0]
Expand All @@ -1657,38 +1657,37 @@ ch_fastqlanemerge_for_stripfastq

}
.filter{ it[8] != null }
.dump(tag: "StripFastq Input")
.set { ch_synced_for_stripfastq }
.set { ch_synced_for_hostremovalfastq }

// Remove mapped reads from original (lane merged) input FASTQ e.g. for sensitive host data when running metagenomic data

process strip_input_fastq {
process hostremoval_input_fastq {
label 'mc_medium'
tag "${libraryid}"
publishDir "${params.outdir}/stripped_fastq", mode: params.publish_dir_mode
publishDir "${params.outdir}/hostremoved_fastq", mode: params.publish_dir_mode

when:
params.strip_input_fastq
params.hostremoval_input_fastq

input:
tuple samplename, libraryid, seqtype, organism, strandedness, udg, path(r1), path(r2), file(bam), file(bai) from ch_synced_for_stripfastq
tuple samplename, libraryid, seqtype, organism, strandedness, udg, file(r1), file(r2), file(bam), file(bai) from ch_synced_for_hostremovalfastq

output:
tuple samplename, libraryid, seqtype, organism, strandedness, udg, file("*.fq.gz") into ch_output_from_stripfastq
tuple samplename, libraryid, seqtype, organism, strandedness, udg, file("*.fq.gz") into ch_output_from_hostremovalfastq

script:
if ( seqtype == 'SE' ) {
out_fwd = bam.baseName+'.stripped.fq.gz'
out_fwd = bam.baseName+'.hostremoved.fq.gz'
"""
samtools index $bam
extract_map_reads.py $bam ${r1} -m ${params.strip_mode} -of $out_fwd -p ${task.cpus}
extract_map_reads.py $bam ${r1} -m ${params.hostremoval_mode} -of $out_fwd -p ${task.cpus}
"""
} else {
out_fwd = bam.baseName+'.stripped.fwd.fq.gz'
out_rev = bam.baseName+'.stripped.rev.fq.gz'
out_fwd = bam.baseName+'.hostremoved.fwd.fq.gz'
out_rev = bam.baseName+'.hostremoved.rev.fq.gz'
"""
samtools index $bam
extract_map_reads.py $bam ${r1} -rev ${r2} -m ${params.strip_mode} -of $out_fwd -or $out_rev -p ${task.cpus}
extract_map_reads.py $bam ${r1} -rev ${r2} -m ${params.hostremoval_mode} -of $out_fwd -or $out_rev -p ${task.cpus}
"""
}

Expand Down
8 changes: 4 additions & 4 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,9 @@ params {
bt2_trim5 = 0
bt2_trim3 = 0

//Mapped read stripping from input FASTQ
strip_input_fastq = false
strip_mode = 'strip'
//Mapped read removal from input FASTQ
hostremoval_input_fastq = false
hostremoval_mode = 'remove'

//BAM Filtering steps (default = discard unmapped reads)
run_bam_filtering = false
Expand Down Expand Up @@ -117,7 +117,7 @@ params {
bamutils_clip_half_udg_right = 1
bamutils_clip_none_udg_left = 1
bamutils_clip_none_udg_right = 1
bamutils_softclip = false
bamutils_softclip = false

//Genotyping options
run_genotyping = false
Expand Down
18 changes: 9 additions & 9 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -570,24 +570,24 @@
},
"fa_icon": "fas fa-layer-group"
},
"stripping": {
"title": "Stripping",
"host_removal": {
"title": "Removal of Host-Mapped Reads",
"type": "object",
"description": "Options for production of host-read removed FASTQ files for privacy reasons.",
"default": "",
"properties": {
"strip_input_fastq": {
"hostremoval_input_fastq": {
"type": "boolean",
"description": "Turn on per-library creation pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)",
"fa_icon": "fas fa-power-off",
"help_text": "Create pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data)\n"
},
"strip_mode": {
"hostremoval_mode": {
"type": "string",
"default": "strip",
"description": "Stripping mode. Remove mapped reads completely from FASTQ (strip) or just mask mapped reads sequence by N (replace).",
"default": "remove",
"description": "Host removal mode. Remove mapped reads completely from FASTQ (remove) or just mask mapped reads sequence by N (replace).",
"fa_icon": "fas fa-mask",
"help_text": "Read removal mode. Strip mapped reads completely (`'strip'`) or just replace mapped reads sequence by N (`'replace'`)\n"
"help_text": "Read removal mode. Remove mapped reads completely (`'remove'`) or just replace mapped reads sequence by N (`'replace'`)\n"
}
},
"fa_icon": "fas fa-user-shield"
Expand Down Expand Up @@ -1374,7 +1374,7 @@
"$ref": "#/definitions/mapping"
},
{
"$ref": "#/definitions/stripping"
"$ref": "#/definitions/host_removal"
},
{
"$ref": "#/definitions/bam_filtering"
Expand Down Expand Up @@ -1419,4 +1419,4 @@
"$ref": "#/definitions/metagenomic_authentication"
}
]
}
}