Merge pull request #220 from jfy133/postmapfilter-stats

apeltzer · web-flow · commit 69c23735fe03 · 2019-06-09T11:45:33.000+02:00
Added post-mapping filter statistics generation
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,7 +14,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 /(https://github.com/nf-core/eager/issues/182)
 * Merged in [nf-core/tools](https://github.com/nf-core/tools) release V1.6 template changes  
 * A lot more automated tests using Travis CI
-* Don't ignore DamageProfiler errors anymore
+* Don't ignore DamageProfiler errors anymore 
+* [#220](https://github.com/nf-core/eager/pull/220) - Added post-mapping filtering statistics module and corresponding MultiQC statistics [#217](https://github.com/nf-core/eager/issues/217)
+
 
 ### `Fixed`
 
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
@@ -10,11 +10,18 @@ top_modules:
      - 'fastp'
      - 'adapterRemoval'
      - 'fastqc':
-         name: 'FastQC (post-AdapterRemoval)'
-         path_filters:
-             - '*.truncated_fastqc.zip'
-             - '*.combined*_fastqc.zip'
-     - 'samtools'
+        name: 'FastQC (post-AdapterRemoval)'
+        path_filters:
+            - '*.truncated_fastqc.zip'
+            - '*.combined*_fastqc.zip'
+     - 'samtools':
+        name: 'Samtools Flagstat (pre-samtools filter)'
+        path_filters:
+             - '*.sorted.stats'
+     - 'samtools':
+        name: 'Samtools Flagstat (post-samtools filter)'
+        path_filters:
+             - '*.sorted.bam.filtered.stats'
      - 'dedup'
      - 'preseq'
      - 'qualimap'
@@ -43,11 +50,15 @@ table_columns_visible:
         percent_duplicates: False
         total_sequences: True
         percent_gc: True
+    Samtools Flagstat (pre-samtools filter):
+        mapped_passed: True
+    Samtools Flagstat (post-samtools filter):
+        mapped_passed: True
     QualiMap:
         1_x_pc: True
         5_x_pc: True
         percentage_aligned: False
-    DamageProfiler::
+    DamageProfiler:
         3 Prime1: True
         3 Prime2: True
         5 Prime1: True
@@ -67,9 +78,11 @@ table_columns_placement:
         total_sequences: 400
         avg_sequence_length: 410
         percent_gc: 420
-    Samtools Flagstat:
+    Samtools Flagstat (pre-samtools filter):
         mapped_passed: 500
-    DeDup: 
+    Samtools Flagstat (post-samtools filter):
+        mapped_passed: 510
+    DeDup:
         clusterfactor: 600        
         duplication_rate: 610
     QualiMap:
@@ -84,7 +97,7 @@ table_columns_placement:
         mtreads: 800
         mt_cov_avg: 810
         mt_nuc_ratio: 820
-    DamageProfiler::
+    DamageProfiler:
         3 Prime1: 900
         3 Prime2: 910
         5 Prime1: 920
diff --git a/docs/output.md b/docs/output.md
@@ -72,6 +72,7 @@ The default columns are as follows:
   * **Seqs** This is from Post-AdapterRemoval FastQC. Represents the number of preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. The loss between this number and the Pre-AdapterRemoval FastQC can give you an idea of the quality of trimming and merging.
   * **%GC** This is from Post-AdapterRemoval FastQC. Represents the average GC of all preprocessed reads in your adapter trimmed (paired end) merged FASTQ file.
   * **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering and deduplication.
+  * **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering and deduplication (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second)
   * **Duplication Rate** This is from DeDup. This is the percentage of overall number of mapped reads that were an exact duplicate of another read. The number of reads removed by DeDup can be calculating this number by mapped reads (if no map quality filtering was applied!)
   * **Coverage** This is from Qualimap. This is the median number of times a base on your reference genome was covered by a read (i.e. depth coverage).. This average includes bases with 0 reads covering that position.
   * **>= 1X** to **>= 5X** These are from Qualimap. This is the percentage of the genome covered at that particular depth coverage.
@@ -233,6 +234,9 @@ The third row 'Mapped' represents the number of reads that found a place that co
 
 The remaining rows will be 0 when running `bwa aln` as these characteristucs of the data are not considered by the algorithm by default.
 
+> **NB:** The Samtools (pre-samtools filter) plots displayed in the MultiQC report shows mapped reads without mapping quality filtering. This will contain reads that can map to multiple places on your reference genome with equal or slightly less mapping quality score. To see how your reads look after mapping quality, look at the FastQC reports in the Samtools (pre-samtools filter). You should expect after mapping quality filtering, that you will have less reads.
+
+
 ### DeDup
 #### Background
 DeDup is a duplicate removal tool which searchs for PCR duplicates and removes them from your BAM file. We remove these duplicates because otherwise you would be artificially increasing your coverage and subsequently confidence in genotyping, by considering these lab artefacts which are not biologically meaningful. DeDup looks for reads with the same start and end coordinates, and whether they have exactly the same sequence. The main difference of DeDup versus e.g. `samtools markduplicates` is that DeDup considers _both_ ends of a read, not just the start position, so it is more precise in removing actual duplicates without penalising often already low aDNA data.
diff --git a/main.nf b/main.nf
@@ -443,7 +443,7 @@ ${summary.collect { k,v -> "            <dt>$k</dt><dd><samp>${v ?: '<span style
 
 
 /* 
-* Create BWA indices if they are not present
+* PREPROCESSING - Create BWA indices if they are not present
 */ 
 
 if(!params.bwa_index && !params.fasta.isEmpty() && (params.aligner == 'bwa' || params.bwamem)){
@@ -528,7 +528,7 @@ process makeSeqDict {
 }
 
 /*
-* Convert BAM to FastQ if BAM input is specified instead of FastQ file(s)
+* PREPROCESSING - Convert BAM to FastQ if BAM input is specified instead of FastQ file(s)
 *
 */ 
 
@@ -554,7 +554,7 @@ process convertBam {
 
 
 /*
- * STEP 1 - FastQC
+ * STEP 1a - FastQC
  */
 process fastqc {
     tag "$name"
@@ -578,7 +578,7 @@ process fastqc {
 }
 
 
-/* STEP 2.0 - FastP
+/* STEP 1b - FastP
 * Optional poly-G complexity filtering step before read merging/adapter clipping etc
 * Note: Clipping, Merging, Quality Trimning are turned off here - we leave this to adapter removal itself!
 */
@@ -676,7 +676,7 @@ process adapter_removal {
 
 
 /*
-* STEP 2.1 - FastQC after clipping/merging (if applied!)
+* STEP 2b - FastQC after clipping/merging (if applied!)
 */
 process fastqc_after_clipping {
     tag "${name}"
@@ -698,7 +698,7 @@ process fastqc_after_clipping {
 }
 
 /*
-Step 3: Mapping with BWA, SAM to BAM, Sort BAM
+Step 3a  - Mapping with BWA, SAM to BAM, Sort BAM
 */
 
 process bwa {
@@ -713,7 +713,7 @@ process bwa {
 
 
     output:
-    file "*.sorted.bam" into ch_mapped_reads_idxstats,ch_mapped_reads_filter,ch_mapped_reads_preseq, ch_mapped_reads_damageprofiler, ch_bwa_mapped_reads_strip
+    file "*.sorted.bam" into ch_mapped_reads_flagstat,ch_mapped_reads_filter,ch_mapped_reads_preseq, ch_mapped_reads_damageprofiler, ch_bwa_mapped_reads_strip
     file "*.{bai,csi}" into ch_bam_index_for_damageprofiler
     
 
@@ -780,7 +780,7 @@ process circularmapper{
     file fasta from fasta_for_indexing
 
     output:
-    file "*.sorted.bam" into ch_mapped_reads_idxstats_cm,ch_mapped_reads_filter_cm,ch_mapped_reads_preseq_cm, ch_mapped_reads_damageprofiler_cm, ch_circular_mapped_reads_strip
+    file "*.sorted.bam" into ch_mapped_reads_flagstat_cm,ch_mapped_reads_filter_cm,ch_mapped_reads_preseq_cm, ch_mapped_reads_damageprofiler_cm, ch_circular_mapped_reads_strip
     file "*.{bai,csi}" 
     
     script:
@@ -824,7 +824,7 @@ process bwamem {
     file index from bwa_index_bwamem.collect()
 
     output:
-    file "*.sorted.bam" into ch_bwamem_mapped_reads_idxstats,ch_bwamem_mapped_reads_filter,ch_bwamem_mapped_reads_preseq, ch_bwamem_mapped_reads_damageprofiler, ch_bwamem_mapped_reads_strip
+    file "*.sorted.bam" into ch_bwamem_mapped_reads_flagstat,ch_bwamem_mapped_reads_filter,ch_bwamem_mapped_reads_preseq, ch_bwamem_mapped_reads_damageprofiler, ch_bwamem_mapped_reads_strip
     file "*.{bai,csi}" 
     
 
@@ -848,18 +848,18 @@ process bwamem {
 }
 
 /*
-* Step 4 - IDXStats
+* Step 3b - flagstat
 */
 
-process samtools_idxstats {
+process samtools_flagstat {
     tag "$prefix"
     publishDir "${params.outdir}/samtools/stats", mode: 'copy'
 
     input:
-    file(bam) from ch_mapped_reads_idxstats.mix(ch_mapped_reads_idxstats_cm,ch_bwamem_mapped_reads_idxstats)
+    file(bam) from ch_mapped_reads_flagstat.mix(ch_mapped_reads_flagstat_cm,ch_bwamem_mapped_reads_flagstat)
 
     output:
-    file "*.stats" into ch_idxstats_for_multiqc
+    file "*.stats" into ch_flagstat_for_multiqc
 
     script:
     prefix = "$bam" - ~/(\.bam)?$/
@@ -870,7 +870,7 @@ process samtools_idxstats {
 
 
 /*
-* Step 5: Keep unmapped/remove unmapped reads
+* Step 4a - Keep unmapped/remove unmapped reads
 */
 
 process samtools_filter {
@@ -887,7 +887,7 @@ process samtools_filter {
     file bam from ch_mapped_reads_filter.mix(ch_mapped_reads_filter_cm,ch_bwamem_mapped_reads_filter)
 
     output:
-    file "*filtered.bam" into ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk
+    file "*filtered.bam" into ch_bam_filtered_flagstat, ch_bam_filtered_qualimap, ch_bam_filtered_dedup, ch_bam_filtered_markdup, ch_bam_filtered_pmdtools, ch_bam_filtered_angsd, ch_bam_filtered_gatk
     file "*.fastq.gz" optional true
     file "*.unmapped.bam" optional true
     file "*.{bai,csi}"
@@ -959,9 +959,31 @@ process strip_input_fastq {
     
 }
 
+/*
+* Step 4b: Keep unmapped/remove unmapped reads flagstat
+*/
+
+
+process samtools_flagstat_after_filter {
+    tag "$prefix"
+    publishDir "${params.outdir}/samtools/stats", mode: 'copy'
+
+    input:
+    file(bam) from ch_bam_filtered_flagstat
+
+    output:
+    file "*.stats" into ch_bam_filtered_flagstat_for_multiqc
+
+    script:
+    prefix = "$bam" - ~/(\.bam)?$/
+    """
+    samtools flagstat $bam > ${prefix}.stats
+    """
+}
+
 
 /*
-Step 6: DeDup / MarkDuplicates
+Step 5a: DeDup / MarkDuplicates
 */ 
 
 process dedup{
@@ -1004,7 +1026,7 @@ process dedup{
 }
 
 /*
-Step 5.1: Preseq
+Step 6: Preseq
 */
 
 process preseq {
@@ -1034,7 +1056,7 @@ process preseq {
 }
 
 /*
-Step 5.2: DMG Assessment
+Step 7a: DMG Assessment
 */ 
 
 process damageprofiler {
@@ -1062,7 +1084,7 @@ process damageprofiler {
 }
 
 /* 
-Step 5.3: Qualimap
+Step 8: Qualimap
 */
 
 process qualimap {
@@ -1090,7 +1112,7 @@ process qualimap {
 
 
 /*
- Step 6: MarkDuplicates
+ Step 5b: MarkDuplicates
  */
 
 process markDup{
@@ -1130,6 +1152,10 @@ if(!params.run_pmdtools){
     ch_dedup_for_pmdtools.close()
 }
 
+/*
+ Step 9: PMDtools
+ */
+
 process pmdtools {
     tag "${bam.baseName}"
     publishDir "${params.outdir}/pmdtools", mode: 'copy'
@@ -1162,7 +1188,7 @@ process pmdtools {
 }
 
 /*
-* Optional BAM Trimming step using bamUtils 
+* Step 10 - BAM Trimming step using bamUtils 
 * Can be used for UDGhalf protocols to clip off -n bases of each read
 */
 
@@ -1217,7 +1243,7 @@ Downstream VCF tools:
 
 
 /*
- * STEP 3 - Output Description HTML
+ * Step 11a - Output Description HTML
  */
 process output_documentation {
     publishDir "${params.outdir}/Documentation", mode: 'copy'
@@ -1236,7 +1262,7 @@ process output_documentation {
 
 
 /*
- * Parse software version numbers
+ * Step 11b - Parse software version numbers
  */
 process get_software_versions {
 
@@ -1271,7 +1297,7 @@ process get_software_versions {
 
 
 /*
- * STEP 2 - MultiQC
+ * Step 11c - MultiQC
  */
 process multiqc {
     publishDir "${params.outdir}/MultiQC", mode: 'copy'
@@ -1282,7 +1308,8 @@ process multiqc {
     file('fastqc/*') from ch_fastqc_after_clipping.collect().ifEmpty([])
     file ('software_versions/software_versions_mqc*') from software_versions_yaml.collect().ifEmpty([])
     file ('adapter_removal/*') from ch_adapterremoval_logs.collect().ifEmpty([])
-    file ('idxstats/*') from ch_idxstats_for_multiqc.collect().ifEmpty([])
+    file ('flagstat/*') from ch_flagstat_for_multiqc.collect().ifEmpty([])
+    file ('flagstat_filtered/*') from ch_bam_filtered_flagstat_for_multiqc.collect().ifEmpty([])
     file ('preseq/*') from ch_preseq_results.collect().ifEmpty([])
     file ('damageprofiler/dmgprof*/*') from ch_damageprofiler_results.collect().ifEmpty([])
     file ('qualimap/qualimap*/*') from ch_qualimap_results.collect().ifEmpty([])
@@ -1308,7 +1335,7 @@ process multiqc {
 
 
 /*
- * Completion e-mail notification
+ * Step 11d - Completion e-mail notification
  */
 workflow.onComplete {