Merge pull request #765 from nf-core/inline-barcode-trimming

jfy133 · web-flow · commit 2519dcb9509c · 2021-08-23T14:09:31.000+02:00
Add basic functionality for barcode trimming/fastq trimming
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -59,7 +59,7 @@ jobs:
           git clone --single-branch --branch eager https://github.com/nf-core/test-datasets.git data
       - name: DELAY to try address some odd behaviour with what appears to be a conflict between parallel htslib jobs leading to CI hangs
         run: | 
-          if [[ $NXF_VER = '' ]]; then sleep 360; fi
+          if [[ $NXF_VER = '' ]]; then sleep 1200; fi
       - name: BASIC Run the basic pipeline with directly supplied single-end FASTQ
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test,docker --input 'data/testdata/Mammoth/fastq/*_R1_*.fq.gz' --single_end
@@ -108,6 +108,12 @@ jobs:
       - name: ADAPTER LIST Run the basic pipeline using an adapter list, skipping adapter removal
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_adapters_list 'https://github.com/nf-core/test-datasets/raw/eager/databases/adapters/adapter-list.txt' --skip_adapterremoval 
+      - name: POST_AR_FASTQ_TRIMMING Run the basic pipeline post-adapterremoval FASTQ trimming
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_post_ar_trimming
+      - name: POST_AR_FASTQ_TRIMMING Run the basic pipeline post-adapterremoval FASTQ trimming, but skip adapterremoval
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_post_ar_trimming --skip_adapterremoval
       - name: MAPPER_CIRCULARMAPPER Test running with CircularMapper
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --mapper 'circularmapper' --circulartarget 'NC_007596.2'
@@ -203,4 +209,4 @@ jobs:
           nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio
       - name: RESCALING Run basic pipeline with basic pipeline but with mapDamage rescaling of BAM files. Note this will be slow
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 - [#651](https://github.com/nf-core/eager/issues/651) - Adds removal of adapters specified in an AdapterRemoval adapter list file
 - [#769](https://github.com/nf-core/eager/issues/769) - Adds lc_extrap mode to preseq (requested by @roberta-davidson)
+- [#642](https://github.com/nf-core/eager/issues/642) and [#431](https://github.com/nf-core/eager/issues/431) - Adds post-AdapterRemoval barcode/FASTQ trimming
 
 ### `Fixed`
 
diff --git a/README.md b/README.md
@@ -66,7 +66,7 @@ By default the pipeline currently performs the following:
 
 * Create reference genome indices for mapping (`bwa`, `samtools`, and `picard`)
 * Sequencing quality control (`FastQC`)
-* Sequencing adapter removal and for paired end data merging (`AdapterRemoval`)
+* Sequencing adapter removal and paired-end data merging (`AdapterRemoval`)
 * Read mapping to reference using (`bwa aln`, `bwa mem`, `CircularMapper`, or `bowtie2`)
 * Post-mapping processing, statistics and conversion to bam (`samtools`)
 * Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler`)
@@ -86,6 +86,7 @@ Additional functionality contained by the pipeline currently includes:
 #### Preprocessing
 
 * Illumina two-coloured sequencer poly-G tail removal (`fastp`)
+* Post-AdapterRemoval trimming of FASTQ files prior to mapping (`fastp`)
 * Automatic conversion of unmapped reads to FASTQ (`samtools`)
 * Host DNA (mapped reads) stripping from input FASTQ files (for sensitive samples)
 
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
@@ -60,13 +60,13 @@ extra_fn_clean_exts:
 
 top_modules:
     - 'fastqc':
-       name: 'FastQC (pre-AdapterRemoval)'
+       name: 'FastQC (pre-Trimming)'
        path_filters:
            - '*_raw_fastqc.zip'
     - 'fastp'
     - 'adapterRemoval'
     - 'fastqc':
-       name: 'FastQC (post-AdapterRemoval)'
+       name: 'FastQC (post-Trimming)'
        path_filters:
             - '*.truncated_fastqc.zip'
             - '*.combined*_fastqc.zip'
@@ -108,7 +108,7 @@ remove_sections:
   - sexdeterrmine-snps
 
 table_columns_visible:
-    FastQC (pre-AdapterRemoval):
+    FastQC (pre-Trimming):
         percent_duplicates: False
         percent_gc: True
         avg_sequence_length: True
@@ -119,7 +119,7 @@ table_columns_visible:
     Adapter Removal:
         aligned_total: False
         percent_aligned: True
-    FastQC (post-AdapterRemoval):
+    FastQC (post-Trimming):
         avg_sequence_length: True
         percent_duplicates: False
         total_sequences: True
@@ -182,15 +182,15 @@ table_columns_visible:
         Total_Snps: False
 
 table_columns_placement:
-    FastQC (pre-AdapterRemoval):
+    FastQC (pre-Trimming):
         total_sequences: 100
         avg_sequence_length: 110
         percent_gc: 120
     fastp:
         after_filtering_gc_content: 200
     Adapter Removal:
         percent_aligned: 300
-    FastQC (post-AdapterRemoval): 
+    FastQC (post-Trimming): 
         total_sequences: 400
         avg_sequence_length: 410
         percent_gc: 420
diff --git a/docs/output.md b/docs/output.md
@@ -112,7 +112,8 @@ When dealing with ancient DNA data the MultiQC plots for FastQC will often show
 
 For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
 
-> **NB:** The FastQC (pre-AdapterRemoval) plots displayed in the MultiQC report shows *untrimmed* reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-AdapterRemoval). You should expect after AdapterRemoval, that most of the artefacts are removed.
+> **NB:** The FastQC (pre-Trimming) plots displayed in the MultiQC report show *untrimmed* reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-Trimming) section. You should expect that, after AdapterRemoval, most of the artefacts are removed.
+> :warning: If you turned on `--run_post_ar_trimming`, your 'post-Trimming' report shows the statistics _after_ this trimming. There is no separate report for the post-AdapterRemoval trimming step.
 
 #### Sequence Counts
 
@@ -648,7 +649,8 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
 
 * `reference_genome/`: this directory contains the indexing files  of your input reference genome (i.e. the various `bwa` indices, a `samtools`' `.fai` file, and a picard `.dict`), if you used the `--saveReference` flag.
 * `fastqc/`: this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval.
-* `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) FASTQ files. These you can use for downstream applications such as taxonomic binning for metagenomic studies.
+* `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) `fastq` files. These you can use for downstream applications such as taxonomic binning for metagenomic studies.
+* `post_ar_fastq_trimmed/`: this contains `fastq` files that have been additionally trimmed after AdapterRemoval (if turned on). These are usually reads that had internal barcodes, or damage, that needed to be removed before mapping.
 * `mapping/`: this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). You will also find a corresponding BAM index file (ending in `.csi` or `.bam`), and if running the `bowtie2` mapper: a log ending in `_bt2.log`. You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in nf-core/eager (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!).
 * `samtools/`: this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file.
 * `deduplication/`: this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics.
diff --git a/main.nf b/main.nf
@@ -950,16 +950,99 @@ if ( params.skip_collapse ){
 // AdapterRemoval bypass when not running it
 if (!params.skip_adapterremoval) {
     ch_output_from_adapterremoval.mix(ch_fastp_for_skipadapterremoval)
+        .dump(tag: "post_ar_adapterremoval_decision_withar") // AdapterRemoval was run: dump the mixed channel before it is filtered
         .filter { it =~/.*combined.fq.gz|.*truncated.gz/ }
-        .dump(tag: "AR Bypass")
-        .into { ch_adapterremoval_for_fastqc_after_clipping; ch_adapterremoval_for_lanemerge; } 
+        .dump(tag: "ar_bypass")
+        .into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; } 
 } else {
     ch_fastp_for_skipadapterremoval
-        .into { ch_adapterremoval_for_fastqc_after_clipping; ch_adapterremoval_for_lanemerge; } 
+        .dump(tag: "post_ar_adapterremoval_decision_skipar") // AdapterRemoval was skipped: reads come straight from the fastp/skip channel
+        .into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; } 
+}
+
+// Post AR fastq trimming: optionally clip a fixed number of bases from read ends with fastp (run with -A -G -Q -L, i.e. adapter/polyG trimming and quality/length filtering disabled)
+
+process post_ar_fastq_trimming {
+  label 'mc_small'
+  tag "${libraryid}"
+  publishDir "${params.outdir}/post_ar_fastq_trimmed", mode: params.publish_dir_mode
+
+  when: params.run_post_ar_trimming // only executes when post-AR trimming is explicitly requested
+
+  input:
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_adapterremoval_for_post_ar_trimming
+
+  output:
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_R1_postartrimmed.fq.gz") into ch_post_ar_trimming_for_lanemerge_r1
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_R2_postartrimmed.fq.gz") optional true into ch_post_ar_trimming_for_lanemerge_r2 // optional: only the uncollapsed paired-end branch writes an R2 file
+
+  script:
+  if ( seqtype == 'SE' || (seqtype == 'PE' && !params.skip_collapse) ) { // single-file mode, only r1 is trimmed. NOTE(review): PE input with --skip_adapterremoval but without --skip_collapse also lands here, so r2 is neither trimmed nor emitted - confirm this is intended
+  """
+  fastp --in1 ${r1} --trim_front1 ${params.post_ar_trim_front} --trim_tail1 ${params.post_ar_trim_tail} -A -G -Q -L -w ${task.cpus} --out1 "${libraryid}"_L"${lane}"_R1_postartrimmed.fq.gz
+  """
+  } else if ( seqtype == 'PE' && params.skip_collapse ) { // uncollapsed paired-end mode: both mates are trimmed, each with its own front/tail parameters
+  """
+  fastp --in1 ${r1} --in2 ${r2}  --trim_front1 ${params.post_ar_trim_front} --trim_tail1 ${params.post_ar_trim_tail} --trim_front2 ${params.post_ar_trim_front2} --trim_tail2 ${params.post_ar_trim_tail2} -A -G -Q -L -w ${task.cpus} --out1 "${libraryid}"_L"${lane}"_R1_postartrimmed.fq.gz --out2 "${libraryid}"_L"${lane}"_R2_postartrimmed.fq.gz
+  """
+  }
+
+}
+
+// Rebuild the [ metadata..., r1, r2 ] tuple from the separate R1/R2 outputs of post_ar_fastq_trimming. When not collapsing paired-end data, re-merge the R1 and R2 files into a single tuple. Otherwise (SE or collapsed PE) R2 becomes the dummy placeholder file
+// Sort to make sure R1 and R2 are consistently ordered when using `-resume`, even if not needed for FastQC
+if ( params.skip_collapse ){
+  ch_post_ar_trimming_for_lanemerge_r1
+    .mix(ch_post_ar_trimming_for_lanemerge_r2)
+    .groupTuple(by: [0,1,2,3,4,5,6]) // group on all seven metadata fields so the files emitted for the same entry are collected into one list at index 7
+    .map{
+      it -> 
+        def samplename = it[0]
+        def libraryid  = it[1]
+        def lane = it[2]
+        def seqtype = it[3]
+        def organism = it[4]
+        def strandedness = it[5]
+        def udg = it[6]
+        def r1 = file(it[7].sort()[0]) // first file after sorting; presumably the *_R1_* file sorts before *_R2_* - verify against the process output names
+        def r2 = seqtype == "PE" ? file(it[7].sort()[1]) : file("$projectDir/assets/nf-core_eager_dummy.txt") // non-PE entries get the dummy file in place of R2
+
+        [ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
+
+    }
+    .set { ch_post_ar_trimming_for_lanemerge; }
+} else {
+  ch_post_ar_trimming_for_lanemerge_r1 // only the R1 output channel is read in this branch
+    .map{
+      it -> 
+        def samplename = it[0]
+        def libraryid  = it[1]
+        def lane = it[2]
+        def seqtype = it[3]
+        def organism = it[4]
+        def strandedness = it[5]
+        def udg = it[6]
+        def r1 = file(it[7])
+        def r2 = file("$projectDir/assets/nf-core_eager_dummy.txt") // R2 is always the dummy placeholder here
+
+        [ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
+    }
+    .set { ch_post_ar_trimming_for_lanemerge; }
+}
+
+
+// Inline barcode removal bypass: when post-AR trimming is run, send the trimmed reads to FastQC and lane merging; otherwise pass the untrimmed AdapterRemoval-bypass channel through unchanged
+if (params.run_post_ar_trimming) {
+    ch_post_ar_trimming_for_lanemerge
+        .dump(tag: "inline_removal_bypass")
+        .into { ch_inlinebarcoderemoval_for_fastqc_after_clipping; ch_inlinebarcoderemoval_for_lanemerge; } 
+} else {
+    ch_adapterremoval_for_skip_post_ar_trimming
+        .into { ch_inlinebarcoderemoval_for_fastqc_after_clipping; ch_inlinebarcoderemoval_for_lanemerge; } 
 }
 
 // Lane merging for libraries sequenced over multiple lanes (e.g. NextSeq)
-ch_branched_for_lanemerge = ch_adapterremoval_for_lanemerge
+ch_branched_for_lanemerge = ch_inlinebarcoderemoval_for_lanemerge
   .groupTuple(by: [0,1,3,4,5,6])
   .map {
     it ->
@@ -976,7 +1059,7 @@ ch_branched_for_lanemerge = ch_adapterremoval_for_lanemerge
       [ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
 
   }
-  .dump(tag: "LaneMerge Bypass")
+  .dump(tag: "lanemerge_bypass_decision")
   .branch {
     skip_merge: it[7].size() == 1 // Can skip merging if only single lanes
     merge_me: it[7].size() > 1
@@ -997,7 +1080,7 @@ ch_branched_for_lanemerge_skipme = ch_branched_for_lanemerge.skip_merge
 
         [ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ]
   }
-  .dump(tag: "LaneMerge Reconfigure")
+  .dump(tag: "lanemerge_reconfigure")
 
 
 ch_branched_for_lanemerge_ready = ch_branched_for_lanemerge.merge_me
@@ -1025,7 +1108,7 @@ process lanemerge {
   publishDir "${params.outdir}/lanemerging", mode: params.publish_dir_mode
 
   input:
-  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_branched_for_lanemerge_ready
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_branched_for_lanemerge_ready.dump(tag: "lange_merge_input")
 
   output:
   tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_R1_lanemerged.fq.gz") into ch_lanemerge_for_mapping_r1
@@ -1049,7 +1132,7 @@ process lanemerge {
 // Ensuring always valid R2 file even if doesn't exist for AWS
 if ( ( params.skip_collapse || params.skip_adapterremoval ) ) {
   ch_lanemerge_for_mapping_r1
-    .dump(tag: "Post LaneMerge Reconfigure")
+    .dump(tag: "post_lanemerge_reconfigure")
     .mix(ch_lanemerge_for_mapping_r2)
     .groupTuple(by: [0,1,2,3,4,5,6])
     .map{
@@ -1120,7 +1203,7 @@ process lanemerge_hostremoval_fastq {
 
 }
 
-// Post-preprocessing QC to help user check pre-processing removed all sequencing artefacts
+// Post-preprocessing QC to help user check pre-processing removed all sequencing artefacts. If post-AR trimming is run, the effect of that step is included in this output.
 
 process fastqc_after_clipping {
     label 'mc_small'
@@ -1134,7 +1217,7 @@ process fastqc_after_clipping {
     when: !params.skip_adapterremoval && !params.skip_fastqc
 
     input:
-    tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_adapterremoval_for_fastqc_after_clipping
+    tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_inlinebarcoderemoval_for_fastqc_after_clipping
 
     output:
     path("*_fastqc.{zip,html}") into ch_fastqc_after_clipping
@@ -1164,7 +1247,7 @@ process bwa {
     publishDir "${params.outdir}/mapping/bwa", mode: params.publish_dir_mode
 
     input:
-    tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_lanemerge_for_bwa.dump(tag: "input_tuple")
+    tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_lanemerge_for_bwa.dump(tag: "bwa_input_reads")
     path index from bwa_index.collect().dump(tag: "input_index")
 
     output:
@@ -1483,7 +1566,7 @@ ch_branched_for_seqtypemerge = ch_mapping_for_seqtype_merging
       [ samplename, libraryid, lane, seqtype_new, organism, strandedness, udg, r1, r2 ]
 
   }
-  .dump(tag: "Seqtype")
+  .dump(tag: "pre_seqtype_decision")
   .branch {
     skip_merge: it[7].size() == 1 // Can skip merging if only single lanes
     merge_me: it[7].size() > 1
@@ -1866,7 +1949,7 @@ process library_merge {
   publishDir "${params.outdir}/merged_bams/initial", mode: params.publish_dir_mode
 
   input:
-  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_fixedinput_for_librarymerging.dump(tag: "Input Tuple Library Merge")
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_fixedinput_for_librarymerging.dump(tag: "library_merge_input")
 
   output:
   tuple samplename, val("${samplename}_libmerged"), lane, seqtype, organism, strandedness, udg, path("*_libmerged_rg_rmdup.bam"), path("*_libmerged_rg_rmdup.bam.{bai,csi}") into ch_output_from_librarymerging
@@ -2385,7 +2468,7 @@ process genotyping_pileupcaller {
   file fai from ch_fai_for_pileupcaller.collect()
   file dict from ch_dict_for_pileupcaller.collect()
   path(bed) from ch_bed_for_pileupcaller.collect()
-  path(snp) from ch_snp_for_pileupcaller.collect().dump(tag: "Pileupcaller SNP file")
+  path(snp) from ch_snp_for_pileupcaller.collect().dump(tag: "pileupcaller_snp_file")
 
   output:
   tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("pileupcaller.${strandedness}.*") into ch_for_eigenstrat_snp_coverage
diff --git a/nextflow.config b/nextflow.config
@@ -75,6 +75,11 @@ params {
   preserve5p = false
   mergedonly = false
   qualitymax = 41
+  run_post_ar_trimming = false
+  post_ar_trim_front = 7
+  post_ar_trim_tail = 7
+  post_ar_trim_front2 = 7
+  post_ar_trim_tail2 = 7
 
   //Mapping algorithm
   mapper = 'bwaaln'
diff --git a/nextflow_schema.json b/nextflow_schema.json