Skip to content

Commit 2519dcb

Browse files
authored
Merge pull request #765 from nf-core/inline-barcode-trimming
Add basic functionality for barcode trimming/fastq trimming
2 parents 54396a8 + e02bc14 commit 2519dcb

8 files changed

Lines changed: 152 additions & 32 deletions

File tree

.github/workflows/ci.yml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ jobs:
5959
git clone --single-branch --branch eager https://github.com/nf-core/test-datasets.git data
6060
- name: DELAY to try address some odd behaviour with what appears to be a conflict between parallel htslib jobs leading to CI hangs
6161
run: |
62-
if [[ $NXF_VER = '' ]]; then sleep 360; fi
62+
if [[ $NXF_VER = '' ]]; then sleep 1200; fi
6363
- name: BASIC Run the basic pipeline with directly supplied single-end FASTQ
6464
run: |
6565
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --input 'data/testdata/Mammoth/fastq/*_R1_*.fq.gz' --single_end
@@ -108,6 +108,12 @@ jobs:
108108
- name: ADAPTER LIST Run the basic pipeline using an adapter list, skipping adapter removal
109109
run: |
110110
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' --skip_adapterremoval
111+
- name: POST_AR_FASTQ_TRIMMING Run the basic pipeline post-adapterremoval FASTQ trimming
112+
run: |
113+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_post_ar_trimming
114+
- name: POST_AR_FASTQ_TRIMMING Run the basic pipeline post-adapterremoval FASTQ trimming, but skip adapterremoval
115+
run: |
116+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_post_ar_trimming --skip_adapterremoval
111117
- name: MAPPER_CIRCULARMAPPER Test running with CircularMapper
112118
run: |
113119
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --mapper 'circularmapper' --circulartarget 'NC_007596.2'
@@ -203,4 +209,4 @@ jobs:
203209
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio
204210
- name: RESCALING Run basic pipeline with basic pipeline but with mapDamage rescaling of BAM files. Note this will be slow
205211
run: |
206-
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'
212+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
99

1010
- [#651](https://github.com/nf-core/eager/issues/651) - Adds removal of adapters specified in an AdapterRemoval adapter list file
1111
- [#769](https://github.com/nf-core/eager/issues/769) - Adds lc_extrap mode to preseq (requested by @roberta-davidson)
12+
- [#642](https://github.com/nf-core/eager/issues/642) and [#431](https://github.com/nf-core/eager/issues/431) - Adds post-AdapterRemoval barcode/FASTQ trimming
1213

1314
### `Fixed`
1415

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ By default the pipeline currently performs the following:
6666

6767
* Create reference genome indices for mapping (`bwa`, `samtools`, and `picard`)
6868
* Sequencing quality control (`FastQC`)
69-
* Sequencing adapter removal and for paired end data merging (`AdapterRemoval`)
69+
* Sequencing adapter removal, paired-end data merging (`AdapterRemoval`)
7070
* Read mapping to reference using (`bwa aln`, `bwa mem`, `CircularMapper`, or `bowtie2`)
7171
* Post-mapping processing, statistics and conversion to bam (`samtools`)
7272
* Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler`)
@@ -86,6 +86,7 @@ Additional functionality contained by the pipeline currently includes:
8686
#### Preprocessing
8787

8888
* Illumina two-coloured sequencer poly-G tail removal (`fastp`)
89+
* Post-AdapterRemoval trimming of FASTQ files prior to mapping (`fastp`)
8990
* Automatic conversion of unmapped reads to FASTQ (`samtools`)
9091
* Host DNA (mapped reads) stripping from input FASTQ files (for sensitive samples)
9192

assets/multiqc_config.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,13 @@ extra_fn_clean_exts:
6060

6161
top_modules:
6262
- 'fastqc':
63-
name: 'FastQC (pre-AdapterRemoval)'
63+
name: 'FastQC (pre-Trimming)'
6464
path_filters:
6565
- '*_raw_fastqc.zip'
6666
- 'fastp'
6767
- 'adapterRemoval'
6868
- 'fastqc':
69-
name: 'FastQC (post-AdapterRemoval)'
69+
name: 'FastQC (post-Trimming)'
7070
path_filters:
7171
- '*.truncated_fastqc.zip'
7272
- '*.combined*_fastqc.zip'
@@ -108,7 +108,7 @@ remove_sections:
108108
- sexdeterrmine-snps
109109

110110
table_columns_visible:
111-
FastQC (pre-AdapterRemoval):
111+
FastQC (pre-Trimming):
112112
percent_duplicates: False
113113
percent_gc: True
114114
avg_sequence_length: True
@@ -119,7 +119,7 @@ table_columns_visible:
119119
Adapter Removal:
120120
aligned_total: False
121121
percent_aligned: True
122-
FastQC (post-AdapterRemoval):
122+
FastQC (post-Trimming):
123123
avg_sequence_length: True
124124
percent_duplicates: False
125125
total_sequences: True
@@ -182,15 +182,15 @@ table_columns_visible:
182182
Total_Snps: False
183183

184184
table_columns_placement:
185-
FastQC (pre-AdapterRemoval):
185+
FastQC (pre-Trimming):
186186
total_sequences: 100
187187
avg_sequence_length: 110
188188
percent_gc: 120
189189
fastp:
190190
after_filtering_gc_content: 200
191191
Adapter Removal:
192192
percent_aligned: 300
193-
FastQC (post-AdapterRemoval):
193+
FastQC (post-Trimming):
194194
total_sequences: 400
195195
avg_sequence_length: 410
196196
percent_gc: 420

docs/output.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,8 @@ When dealing with ancient DNA data the MultiQC plots for FastQC will often show
112112

113113
For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
114114

115-
> **NB:** The FastQC (pre-AdapterRemoval) plots displayed in the MultiQC report shows *untrimmed* reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-AdapterRemoval). You should expect after AdapterRemoval, that most of the artefacts are removed.
115+
> **NB:** The FastQC (pre-Trimming) plots displayed in the MultiQC report show *untrimmed* reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-Trimming) section. After AdapterRemoval, you should expect most of the artefacts to have been removed.
116+
> :warning: If you turned on `--run_post_ar_trimming`, your 'post-Trimming' report shows the statistics _after_ this trimming. There is no separate report for the post-AdapterRemoval trimming.
116117
117118
#### Sequence Counts
118119

@@ -648,7 +649,8 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
648649

649650
* `reference_genome/`: this directory contains the indexing files of your input reference genome (i.e. the various `bwa` indices, a `samtools`' `.fai` file, and a picard `.dict`), if you used the `--saveReference` flag.
650651
* `fastqc/`: this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval.
651-
* `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) FASTQ files. These you can use for downstream applications such as taxonomic binning for metagenomic studies.
652+
* `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) `fastq` files. These you can use for downstream applications such as taxonomic binning for metagenomic studies.
653+
* `post_ar_fastq_trimmed/`: this contains `fastq` files that have been additionally trimmed after AdapterRemoval (if turned on). These are usually reads that had internal barcodes, or damage, that needed to be removed before mapping.
652654
* `mapping/`: this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). You will also find a corresponding BAM index file (ending in `.csi` or `.bam`), and if running the `bowtie2` mapper: a log ending in `_bt2.log`. You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in nf-core/eager (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!).
653655
* `samtools/`: this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file.
654656
* `deduplication/`: this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics.

main.nf

Lines changed: 97 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -950,16 +950,99 @@ if ( params.skip_collapse ){
950950
// AdapterRemoval bypass when not running it
951951
if (!params.skip_adapterremoval) {
952952
ch_output_from_adapterremoval.mix(ch_fastp_for_skipadapterremoval)
953+
.dump(tag: "post_ar_adapterremoval_decision_skipar")
953954
.filter { it =~/.*combined.fq.gz|.*truncated.gz/ }
954-
.dump(tag: "AR Bypass")
955-
.into { ch_adapterremoval_for_fastqc_after_clipping; ch_adapterremoval_for_lanemerge; }
955+
.dump(tag: "ar_bypass")
956+
.into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; }
956957
} else {
957958
ch_fastp_for_skipadapterremoval
958-
.into { ch_adapterremoval_for_fastqc_after_clipping; ch_adapterremoval_for_lanemerge; }
959+
.dump(tag: "post_ar_adapterremoval_decision_withar")
960+
.into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; }
961+
}
962+
963+
// Post AR fastq trimming
964+
965+
// Trims a fixed number of bases from the front/tail of reads with fastp after AdapterRemoval.
// Only runs when params.run_post_ar_trimming is set. R1 is always emitted; R2 is emitted only
// in the uncollapsed paired-end branch (hence `optional true` on the R2 output).
// NOTE(review): -A/-G/-Q/-L are understood to switch off fastp's adapter, poly-G, quality and
// length filtering so that only the fixed trimming is applied - confirm against the fastp docs.
process post_ar_fastq_trimming {
966+
label 'mc_small'
967+
tag "${libraryid}"
968+
publishDir "${params.outdir}/post_ar_fastq_trimmed", mode: params.publish_dir_mode
969+
970+
when: params.run_post_ar_trimming
971+
972+
input:
973+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_adapterremoval_for_post_ar_trimming
974+
975+
output:
976+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_R1_postartrimmed.fq.gz") into ch_post_ar_trimming_for_lanemerge_r1
977+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_R2_postartrimmed.fq.gz") optional true into ch_post_ar_trimming_for_lanemerge_r2
978+
979+
script:
980+
if ( seqtype == 'SE' || (seqtype == 'PE' && !params.skip_collapse) ) { // single input file: SE, or PE without skip_collapse
981+
"""
982+
fastp --in1 ${r1} --trim_front1 ${params.post_ar_trim_front} --trim_tail1 ${params.post_ar_trim_tail} -A -G -Q -L -w ${task.cpus} --out1 "${libraryid}"_L"${lane}"_R1_postartrimmed.fq.gz
983+
"""
984+
} else if ( seqtype == 'PE' && params.skip_collapse ) { // uncollapsed PE: trim R1 and R2 with their own front/tail settings
985+
"""
986+
fastp --in1 ${r1} --in2 ${r2} --trim_front1 ${params.post_ar_trim_front} --trim_tail1 ${params.post_ar_trim_tail} --trim_front2 ${params.post_ar_trim_front2} --trim_tail2 ${params.post_ar_trim_tail2} -A -G -Q -L -w ${task.cpus} --out1 "${libraryid}"_L"${lane}"_R1_postartrimmed.fq.gz --out2 "${libraryid}"_L"${lane}"_R2_postartrimmed.fq.gz
987+
"""
988+
}
989+
990+
}
991+
992+
// When not collapsing paired-end data, re-merge the R1 and R2 files into single map. Otherwise if SE or collapsed PE, R2 now becomes NA
993+
// Sort to make sure we get consistent R1 and R2 ordered when using `-resume`, even if not needed for FastQC
994+
if ( params.skip_collapse ){
995+
ch_post_ar_trimming_for_lanemerge_r1
996+
.mix(ch_post_ar_trimming_for_lanemerge_r2)
997+
.groupTuple(by: [0,1,2,3,4,5,6])
998+
.map{
999+
it ->
1000+
def samplename = it[0]
1001+
def libraryid = it[1]
1002+
def lane = it[2]
1003+
def seqtype = it[3]
1004+
def organism = it[4]
1005+
def strandedness = it[5]
1006+
def udg = it[6]
1007+
def r1 = file(it[7].sort()[0])
1008+
def r2 = seqtype == "PE" ? file(it[7].sort()[1]) : file("$projectDir/assets/nf-core_eager_dummy.txt")
1009+
1010+
[ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
1011+
1012+
}
1013+
.set { ch_post_ar_trimming_for_lanemerge; }
1014+
} else {
1015+
ch_post_ar_trimming_for_lanemerge_r1
1016+
.map{
1017+
it ->
1018+
def samplename = it[0]
1019+
def libraryid = it[1]
1020+
def lane = it[2]
1021+
def seqtype = it[3]
1022+
def organism = it[4]
1023+
def strandedness = it[5]
1024+
def udg = it[6]
1025+
def r1 = file(it[7])
1026+
def r2 = file("$projectDir/assets/nf-core_eager_dummy.txt")
1027+
1028+
[ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
1029+
}
1030+
.set { ch_post_ar_trimming_for_lanemerge; }
1031+
}
1032+
1033+
1034+
// Inline barcode removal bypass when not running it
1035+
if (params.run_post_ar_trimming) {
1036+
ch_post_ar_trimming_for_lanemerge // trimming enabled: forward the post-AR trimmed reads to FastQC and lane merging
1037+
.dump(tag: "inline_removal_bypass")
1038+
.into { ch_inlinebarcoderemoval_for_fastqc_after_clipping; ch_inlinebarcoderemoval_for_lanemerge; }
1039+
} else {
1040+
ch_adapterremoval_for_skip_post_ar_trimming // trimming disabled: pass the upstream reads straight through
1041+
.into { ch_inlinebarcoderemoval_for_fastqc_after_clipping; ch_inlinebarcoderemoval_for_lanemerge; }
9591042
}
9601043

9611044
// Lane merging for libraries sequenced over multiple lanes (e.g. NextSeq)
962-
ch_branched_for_lanemerge = ch_adapterremoval_for_lanemerge
1045+
ch_branched_for_lanemerge = ch_inlinebarcoderemoval_for_lanemerge
9631046
.groupTuple(by: [0,1,3,4,5,6])
9641047
.map {
9651048
it ->
@@ -976,7 +1059,7 @@ ch_branched_for_lanemerge = ch_adapterremoval_for_lanemerge
9761059
[ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
9771060

9781061
}
979-
.dump(tag: "LaneMerge Bypass")
1062+
.dump(tag: "lanemerge_bypass_decision")
9801063
.branch {
9811064
skip_merge: it[7].size() == 1 // Can skip merging if only single lanes
9821065
merge_me: it[7].size() > 1
@@ -997,7 +1080,7 @@ ch_branched_for_lanemerge_skipme = ch_branched_for_lanemerge.skip_merge
9971080

9981081
[ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
9991082
}
1000-
.dump(tag: "LaneMerge Reconfigure")
1083+
.dump(tag: "lanemerge_reconfigure")
10011084

10021085

10031086
ch_branched_for_lanemerge_ready = ch_branched_for_lanemerge.merge_me
@@ -1025,7 +1108,7 @@ process lanemerge {
10251108
publishDir "${params.outdir}/lanemerging", mode: params.publish_dir_mode
10261109

10271110
input:
1028-
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_branched_for_lanemerge_ready
1111+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_branched_for_lanemerge_ready.dump(tag: "lange_merge_input")
10291112

10301113
output:
10311114
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_R1_lanemerged.fq.gz") into ch_lanemerge_for_mapping_r1
@@ -1049,7 +1132,7 @@ process lanemerge {
10491132
// Ensuring always valid R2 file even if doesn't exist for AWS
10501133
if ( ( params.skip_collapse || params.skip_adapterremoval ) ) {
10511134
ch_lanemerge_for_mapping_r1
1052-
.dump(tag: "Post LaneMerge Reconfigure")
1135+
.dump(tag: "post_lanemerge_reconfigure")
10531136
.mix(ch_lanemerge_for_mapping_r2)
10541137
.groupTuple(by: [0,1,2,3,4,5,6])
10551138
.map{
@@ -1120,7 +1203,7 @@ process lanemerge_hostremoval_fastq {
11201203

11211204
}
11221205

1123-
// Post-preprocessing QC to help user check pre-processing removed all sequencing artefacts
1206+
// Post-preprocessing QC to help user check pre-processing removed all sequencing artefacts. If post-AR trimming is run, this output is meant to include that step too.
11241207

11251208
process fastqc_after_clipping {
11261209
label 'mc_small'
@@ -1134,7 +1217,7 @@ process fastqc_after_clipping {
11341217
when: !params.skip_adapterremoval && !params.skip_fastqc
11351218

11361219
input:
1137-
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_adapterremoval_for_fastqc_after_clipping
1220+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_inlinebarcoderemoval_for_fastqc_after_clipping
11381221

11391222
output:
11401223
path("*_fastqc.{zip,html}") into ch_fastqc_after_clipping
@@ -1164,7 +1247,7 @@ process bwa {
11641247
publishDir "${params.outdir}/mapping/bwa", mode: params.publish_dir_mode
11651248

11661249
input:
1167-
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_lanemerge_for_bwa.dump(tag: "input_tuple")
1250+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_lanemerge_for_bwa.dump(tag: "bwa_input_reads")
11681251
path index from bwa_index.collect().dump(tag: "input_index")
11691252

11701253
output:
@@ -1483,7 +1566,7 @@ ch_branched_for_seqtypemerge = ch_mapping_for_seqtype_merging
14831566
[ samplename, libraryid, lane, seqtype_new, organism, strandedness, udg, r1, r2 ]
14841567

14851568
}
1486-
.dump(tag: "Seqtype")
1569+
.dump(tag: "pre_seqtype_decision")
14871570
.branch {
14881571
skip_merge: it[7].size() == 1 // Can skip merging if only single lanes
14891572
merge_me: it[7].size() > 1
@@ -1866,7 +1949,7 @@ process library_merge {
18661949
publishDir "${params.outdir}/merged_bams/initial", mode: params.publish_dir_mode
18671950

18681951
input:
1869-
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_fixedinput_for_librarymerging.dump(tag: "Input Tuple Library Merge")
1952+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_fixedinput_for_librarymerging.dump(tag: "library_merge_input")
18701953

18711954
output:
18721955
tuple samplename, val("${samplename}_libmerged"), lane, seqtype, organism, strandedness, udg, path("*_libmerged_rg_rmdup.bam"), path("*_libmerged_rg_rmdup.bam.{bai,csi}") into ch_output_from_librarymerging
@@ -2385,7 +2468,7 @@ process genotyping_pileupcaller {
23852468
file fai from ch_fai_for_pileupcaller.collect()
23862469
file dict from ch_dict_for_pileupcaller.collect()
23872470
path(bed) from ch_bed_for_pileupcaller.collect()
2388-
path(snp) from ch_snp_for_pileupcaller.collect().dump(tag: "Pileupcaller SNP file")
2471+
path(snp) from ch_snp_for_pileupcaller.collect().dump(tag: "pileupcaller_snp_file")
23892472

23902473
output:
23912474
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("pileupcaller.${strandedness}.*") into ch_for_eigenstrat_snp_coverage

nextflow.config

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,11 @@ params {
7575
preserve5p = false
7676
mergedonly = false
7777
qualitymax = 41
78+
run_post_ar_trimming = false
79+
post_ar_trim_front = 7
80+
post_ar_trim_tail = 7
81+
post_ar_trim_front2 = 7
82+
post_ar_trim_tail2 = 7
7883

7984
//Mapping algorithm
8085
mapper = 'bwaaln'

0 commit comments

Comments
 (0)