Merge pull request #525 from nf-core/post-map-lenfilter

apeltzer · web-flow · commit ea3f7295de9c · 2020-07-24T20:01:46.000+02:00
Post mapping length filter
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -18,7 +18,7 @@ jobs:
     strategy:
       matrix:
         # Nextflow versions: check pipeline minimum and current latest
-        nxf_ver: ['19.10.0', '']
+        nxf_ver: ['20.04.0', '']
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v2
@@ -106,7 +106,10 @@ jobs:
           nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --strip_input_fastq
       - name: BAM_FILTERING Run basic mapping pipeline with mapping quality filtering, and unmapped export
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_mapping_quality_threshold 37 --bam_discard_unmapped --bam_unmapped_type 'fastq'
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_mapping_quality_threshold 37  --bam_unmapped_type 'fastq'
+      - name: BAM_FILTERING Run basic mapping pipeline with post-mapping length filtering
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_readlength 0 --run_bam_filtering --bam_filter_minreadlength 50
       - name: DEDUPLICATION Test with dedup
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --dedupper 'dedup' --dedup_all_merged
@@ -160,17 +163,17 @@ jobs:
           for i in index0.idx ref.db ref.idx ref.inf table0.db table0.idx taxonomy.idx taxonomy.map taxonomy.tre; do wget https://github.com/nf-core/test-datasets/raw/eager/databases/malt/"$i" -P databases/malt/; done
       - name: METAGENOMIC Run the basic pipeline but with unmapped reads going into MALT
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_discard_unmapped --bam_unmapped_type 'fastq' --run_metagenomic_screening --database "/home/runner/work/eager/eager/databases/malt/"
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering  --bam_unmapped_type 'fastq' --run_metagenomic_screening --database "/home/runner/work/eager/eager/databases/malt/"
       - name: MALTEXTRACT Download resource files
         run: |
             mkdir -p databases/maltextract
             for i in ncbi.tre ncbi.map; do wget https://github.com/rhuebler/HOPS/raw/0.33/Resources/"$i" -P databases/maltextract/; done
       - name: MALTEXTRACT Basic with MALT plus MaltExtract
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_discard_unmapped --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt' 
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering  --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt' 
       - name: METAGENOMIC Run the basic pipeline but with unmapped reads going into Kraken
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering --bam_discard_unmapped --bam_unmapped_type 'fastq'
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering  --bam_unmapped_type 'fastq'
       - name: SEXDETERMINATION Run the basic pipeline with the bam input profile, but don't convert BAM, skip everything but sex determination
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --run_sexdeterrmine
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -24,11 +24,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 * General documentation additions and cleaning, updated figures with CC-BY license
 * Added large 'fullsize' dataset test-profiles for ancient fish, human, and a draft pathogen contexts.
 * [#257](https://github.com/nf-core/eager/issues/257) Added the bowtie2 aligner as option for mapping, following Poullet and Orlando 2020 doi: [10.3389/fevo.2020.00105](https://doi.org/10.3389/fevo.2020.00105)
-* [#451] Adds ANGSD genotype likelihood calculations as alternative to typical 'genotypers'
-* [#504] Removed sexdeterrmine-snps plot from MultiQC report.
+* [#451](https://github.com/nf-core/eager/issues/451) Adds ANGSD genotype likelihood calculations as alternative to typical 'genotypers'
 * Nuclear contamination results are now shown in the MultiQC report.
 * Tutorial on how to use profiles for reproducible science (i.e. parameter sharing between different groups)
-* Added flexible trimming of bams by library type. 'half' and 'none' UDG libraries can now be trimmed differentially within a single eager run.
+* [#522](https://github.com/nf-core/eager/issues/522)  Added post-mapping length filter to asisst in more realistic endogenous DNA calculations
+* [#512](https://github.com/nf-core/eager/issues/512) Added flexible trimming of bams by library type. 'half' and 'none' UDG libraries can now be trimmed differentially within a single eager run.
 
 ### `Fixed`
 
@@ -48,6 +48,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 * [#501](https://github.com/nf-core/eager/issues/501) - Adds additional validation checks for MALT/MaltExtract database input files
 * [#508](https://github.com/nf-core/eager/issues/508) - Made Markduplicates default dedupper due to narrower context specificity of dedup
 * [#516](https://github.com/nf-core/eager/issues/516) - Made bedtools not report out of memory exit code when warning of inconsistant FASTA/Bed entry names
+* [#504](https://github.com/nf-core/eager/issues/504) - Removed uninformative sexdeterrmine-snps plot from MultiQC report.
 * Nuclear contamination is now reported with the correct library names.
 
 ### `Dependencies`
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
 
 ![GitHub Actions CI Status](https://github.com/nf-core/eager/workflows/nf-core%20eager%20CI/badge.svg)
 ![GitHub Actions Linting Status](https://github.com/nf-core/eager/workflows/nf-core%20linting/badge.svg)
-[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.10.0-brightgreen.svg)](https://www.nextflow.io/)
+[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.04.0-brightgreen.svg)](https://www.nextflow.io/)
 [![nf-core](https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg)](https://nf-co.re/)
 [![DOI](https://zenodo.org/badge/135918251.svg)](https://zenodo.org/badge/latestdoi/135918251)
 
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
@@ -162,7 +162,7 @@ table_columns_visible:
         percentage_aligned: False
     MultiVCFAnalyzer:
         Heterozygous SNP alleles (percent): True
-    custom_content:
+    endorS.py :
         endogenous_dna: True
         endogenous_dna_post: True
     nuclear_contamination:
@@ -205,9 +205,10 @@ table_columns_placement:
     Samtools Flagstat (post-samtools filter):
         flagstat_total: 553
         mapped_passed: 554
-    custom_content:
+    endorS.py :
         endogenous_dna: 600
         endogenous_dna_post: 610
+    nuclear_contamination:
         Num_SNPs: 1100
         Method1_MOM_estimate: 1110
         Method1_MOM_SE: 1120
diff --git a/docs/usage.md b/docs/usage.md
@@ -549,6 +549,8 @@ Defines the adapter sequence to be used for the reverse read in paired end seque
 
 Defines the minimum read length that is required for reads after merging to be considered for downstream analysis after read merging. Default is `30`.
 
+Note that performing read length filtering at this step is not reliable for correct endogenous DNA calculation, when you have a large percentage of very short reads in your library - such retrieved in single-stranded library protocols. When you have very few reads passing this length filter, it will artificially inflate your endogenous DNA by creating a very small denominator. In these cases it is recommended to set this to 0, and use `--bam_filter_minreadlength` to instead, to filter out 'unusuable' short reads after mapping.
+
 #### `--clip_min_read_quality`
 
 Defines the minimum read quality per base that is required for a base to be kept. Individual bases at the ends of reads falling below this threshold will be clipped off. Default is set to `20`.
@@ -705,6 +707,12 @@ Note that in all cases, if `--bam_mapping_quality_threshold` is also supplied, m
 
 Specify a mapping quality threshold for mapped reads to be kept for downstream analysis. By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).
 
+#### `bam_filter_minreadlength`
+
+Specify minimum length of mapped reads. This filtering will apply at the same time as mapping quality filtering.
+
+If used _instead_ of minimum length read filtering at AdapterRemoval, this can be useful to get more realistic endogenous DNA percentages, when most of your reads are very short (e.g. in single-stranded libraries) and would otherwise be discarded by AdapterRemoval (thus making an artifically small denominator for a typical endogenous DNA calculation). Note in this context you should not perform mapping quality filtering nor discarding of unmapped reads to ensure a correct 'denominator' of 'all reads', for the Endogenous DNA calculation.
+
 ### Read DeDuplication Parameters
 
 If using TSV input, deduplication is performed library, i.e. after lane merging.
diff --git a/main.nf b/main.nf
@@ -105,7 +105,8 @@ def helpMessage() {
       --run_bam_filtering                Turn on samtools filter for mapping quality or unmapped reads of BAM files.
       --bam_mapping_quality_threshold    Minimum mapping quality for reads filter. Default: ${params.bam_mapping_quality_threshold}
       --bam_unmapped_type                Defines whether to discard all unmapped reads, keep both mapped and unmapped together, or save as bam and/or only fastq format Options: 'discard', 'bam', 'keep', 'fastq', 'both'. Default: ${params.bam_unmapped_type}
-    
+      --bam_filter_minreadlength         Specify minimum read length to be kept after mapping.
+
     DeDuplication
       --dedupper                    Deduplication method to use. Options: 'dedup', 'markduplicates'. Default: '${params.dedupper}'
       --dedup_all_merged            Turn on treating all reads as merged reads.
@@ -1721,40 +1722,93 @@ process samtools_filter {
     tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.fastq.gz") optional true into ch_bam_filtering_for_metagenomic
     tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.bam") optional true
 
-    script:
-    def size = params.large_ref ? '-c' : ''
+    // Using shell block rather than script because we are playing with awk
+    shell:
+    size = !{params.large_ref} ? '-c' : ''
     
     if ( "${params.bam_unmapped_type}" == "keep" ) {
-        """
-        samtools view -h -b $bam -@ ${task.cpus} -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
-        samtools index ${libraryid}.filtered.bam ${size} 
-        """
+        '''
+        ## Unmapped and MAPQ filtering
+        samtools view -h -b !{bam} -@ !{task.cpus} -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam
+
+        ## Mapped LEN filtering
+        if [[ !{minlength} -eq 0 ]]; then
+            mv tmp_mapped.bam !{libraryid}.filtered.bam
+        else
+            ## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
+            samtools view -h tmp_mapped.bam | awk 'length($10) >= !{minlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
+        fi
+
+        samtools index !{libraryid}.filtered.bam !{size}
+        '''
     } else if("${params.bam_unmapped_type}" == "discard"){
-        """
-        samtools view -h -b $bam -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
-        samtools index ${libraryid}.filtered.bam ${size}
-        """
+        '''
+        ## Unmapped and MAPQ filtering
+        samtools view -h -b !{bam} -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam
+
+        ## Mapped LEN filtering
+        if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then
+            mv tmp_mapped.bam !{libraryid}.filtered.bam
+        else
+            ## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
+            samtools view -h tmp_mapped.bam | awk 'length($10) >= !{params.bam_filter_minreadlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
+        fi
+
+        samtools index !{libraryid}.filtered.bam !{size}
+        '''
     } else if("${params.bam_unmapped_type}" == "bam"){
-        """
-        samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -o ${libraryid}.unmapped.bam
-        samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
-        samtools index ${libraryid}.filtered.bam ${size}
-        """
+        '''
+        ## Unmapped and MAPQ filtering
+        samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam
+        samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam
+
+        ## Mapped LEN filtering
+        if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then
+            mv tmp_mapped.bam !{libraryid}.filtered.bam
+        else
+            ## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
+            samtools view -h tmp_mapped.bam | awk 'length($10) >= !{params.bam_filter_minreadlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
+        fi
+
+        samtools index !{libraryid}.filtered.bam !{size}
+        '''
     } else if("${params.bam_unmapped_type}" == "fastq"){
-        """
-        samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -o ${libraryid}.unmapped.bam
-        samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
-        samtools index ${libraryid}.filtered.bam ${size}
-        samtools fastq -tn ${libraryid}.unmapped.bam | pigz -p ${task.cpus} > ${libraryid}.unmapped.fastq.gz
-        rm ${libraryid}.unmapped.bam
-        """
+        '''
+        ## Unmapped and MAPQ filtering
+        samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam
+        samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam
+
+        ## Mapped LEN filtering
+        if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then
+            mv tmp_mapped.bam !{libraryid}.filtered.bam
+        else
+            ## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
+            samtools view -h tmp_mapped.bam | awk 'length($10) >= !{params.bam_filter_minreadlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
+        fi
+
+        samtools index !{libraryid}.filtered.bam !{size}
+
+        ## FASTQ
+        samtools fastq -tn !{libraryid}.unmapped.bam | pigz -p !{task.cpus} > !{libraryid}.unmapped.fastq.gz
+        rm !{libraryid}.unmapped.bam
+        '''
     } else if("${params.bam_unmapped_type}" == "both"){
-        """
-        samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -o ${libraryid}.unmapped.bam
-        samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
-        samtools index ${libraryid}.filtered.bam ${size}
-        samtools fastq -tn ${libraryid}.unmapped.bam | pigz -p ${task.cpus} > ${libraryid}.unmapped.fastq.gz
-        """
+        '''
+        ## Unmapped and MAPQ filtering
+        samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam
+        samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam
+
+        ## Mapped LEN filtering
+        if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then
+            mv tmp_mapped.bam !{libraryid}.filtered.bam
+        else
+            ## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
+            samtools view -h tmp_mapped.bam | awk 'length($10) >= !{params.bam_filter_minreadlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
+        fi
+
+        samtools index !{libraryid}.filtered.bam !{size}
+        samtools fastq -tn !{libraryid}.unmapped.bam | pigz -p !{task.cpus} > !{libraryid}.unmapped.fastq.gz
+        '''
     }
 }
 
@@ -1795,6 +1849,7 @@ process samtools_flagstat_after_filter {
 if (params.run_bam_filtering) {
   ch_flagstat_for_endorspy
     .join(ch_bam_filtered_flagstat_for_endorspy, by: [0,1,2,3,4,5,6])
+    .dump(tag: "Joined")
     .set{ ch_allflagstats_for_endorspy }
 
 } else {
diff --git a/nextflow.config b/nextflow.config
@@ -81,6 +81,7 @@ params {
   run_bam_filtering = false
   bam_unmapped_type = 'discard'
   bam_mapping_quality_threshold = 0
+  bam_filter_minreadlength = 0
 
   //DamageProfiler settings
   damageprofiler_length = 100
@@ -339,7 +340,7 @@ manifest {
   homePage = 'https://github.com/nf-core/eager'
   description = 'A fully reproducible ancient and modern DNA pipeline in Nextflow and with cloud support.'
   mainScript = 'main.nf'
-  nextflowVersion = '!>=19.10.0'
+  nextflowVersion = '!>=20.04.0'
   version = '2.2.0dev'
 }
 // Function to ensure that resource requirements don't go beyond