Skip to content

Commit ea3f729

Browse files
authored
Merge pull request #525 from nf-core/post-map-lenfilter
Post mapping length filter
2 parents f3ca5fc + 1d6f7e4 commit ea3f729

7 files changed

Lines changed: 110 additions & 41 deletions

File tree

.github/workflows/ci.yml

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
strategy:
1919
matrix:
2020
# Nextflow versions: check pipeline minimum and current latest
21-
nxf_ver: ['19.10.0', '']
21+
nxf_ver: ['20.04.0', '']
2222
steps:
2323
- name: Check out pipeline code
2424
uses: actions/checkout@v2
@@ -106,7 +106,10 @@ jobs:
106106
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --strip_input_fastq
107107
- name: BAM_FILTERING Run basic mapping pipeline with mapping quality filtering, and unmapped export
108108
run: |
109-
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_mapping_quality_threshold 37 --bam_discard_unmapped --bam_unmapped_type 'fastq'
109+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_mapping_quality_threshold 37 --bam_unmapped_type 'fastq'
110+
- name: BAM_FILTERING Run basic mapping pipeline with post-mapping length filtering
111+
run: |
112+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_readlength 0 --run_bam_filtering --bam_filter_minreadlength 50
110113
- name: DEDUPLICATION Test with dedup
111114
run: |
112115
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --dedupper 'dedup' --dedup_all_merged
@@ -160,17 +163,17 @@ jobs:
160163
for i in index0.idx ref.db ref.idx ref.inf table0.db table0.idx taxonomy.idx taxonomy.map taxonomy.tre; do wget https://github.com/nf-core/test-datasets/raw/eager/databases/malt/"$i" -P databases/malt/; done
161164
- name: METAGENOMIC Run the basic pipeline but with unmapped reads going into MALT
162165
run: |
163-
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_discard_unmapped --bam_unmapped_type 'fastq' --run_metagenomic_screening --database "/home/runner/work/eager/eager/databases/malt/"
166+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --database "/home/runner/work/eager/eager/databases/malt/"
164167
- name: MALTEXTRACT Download resource files
165168
run: |
166169
mkdir -p databases/maltextract
167170
for i in ncbi.tre ncbi.map; do wget https://github.com/rhuebler/HOPS/raw/0.33/Resources/"$i" -P databases/maltextract/; done
168171
- name: MALTEXTRACT Basic with MALT plus MaltExtract
169172
run: |
170-
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_discard_unmapped --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt'
173+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt'
171174
- name: METAGENOMIC Run the basic pipeline but with unmapped reads going into Kraken
172175
run: |
173-
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering --bam_discard_unmapped --bam_unmapped_type 'fastq'
176+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering --bam_unmapped_type 'fastq'
174177
- name: SEXDETERMINATION Run the basic pipeline with the bam input profile, but don't convert BAM, skip everything but sex determination
175178
run: |
176179
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --run_sexdeterrmine

CHANGELOG.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
2424
* General documentation additions and cleaning, updated figures with CC-BY license
2525
* Added large 'fullsize' dataset test-profiles for ancient fish, human, and a draft pathogen contexts.
2626
* [#257](https://github.com/nf-core/eager/issues/257) Added the bowtie2 aligner as option for mapping, following Poullet and Orlando 2020 doi: [10.3389/fevo.2020.00105](https://doi.org/10.3389/fevo.2020.00105)
27-
* [#451] Adds ANGSD genotype likelihood calculations as alternative to typical 'genotypers'
28-
* [#504] Removed sexdeterrmine-snps plot from MultiQC report.
27+
* [#451](https://github.com/nf-core/eager/issues/451) Adds ANGSD genotype likelihood calculations as alternative to typical 'genotypers'
2928
* Nuclear contamination results are now shown in the MultiQC report.
3029
* Tutorial on how to use profiles for reproducible science (i.e. parameter sharing between different groups)
31-
* Added flexible trimming of bams by library type. 'half' and 'none' UDG libraries can now be trimmed differentially within a single eager run.
30+
* [#522](https://github.com/nf-core/eager/issues/522) Added post-mapping length filter to asisst in more realistic endogenous DNA calculations
31+
* [#512](https://github.com/nf-core/eager/issues/512) Added flexible trimming of bams by library type. 'half' and 'none' UDG libraries can now be trimmed differentially within a single eager run.
3232

3333
### `Fixed`
3434

@@ -48,6 +48,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
4848
* [#501](https://github.com/nf-core/eager/issues/501) - Adds additional validation checks for MALT/MaltExtract database input files
4949
* [#508](https://github.com/nf-core/eager/issues/508) - Made Markduplicates default dedupper due to narrower context specificity of dedup
5050
* [#516](https://github.com/nf-core/eager/issues/516) - Made bedtools not report out of memory exit code when warning of inconsistant FASTA/Bed entry names
51+
* [#504](https://github.com/nf-core/eager/issues/504) - Removed uninformative sexdeterrmine-snps plot from MultiQC report.
5152
* Nuclear contamination is now reported with the correct library names.
5253

5354
### `Dependencies`

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
![GitHub Actions CI Status](https://github.com/nf-core/eager/workflows/nf-core%20eager%20CI/badge.svg)
66
![GitHub Actions Linting Status](https://github.com/nf-core/eager/workflows/nf-core%20linting/badge.svg)
7-
[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.10.0-brightgreen.svg)](https://www.nextflow.io/)
7+
[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.04.0-brightgreen.svg)](https://www.nextflow.io/)
88
[![nf-core](https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg)](https://nf-co.re/)
99
[![DOI](https://zenodo.org/badge/135918251.svg)](https://zenodo.org/badge/latestdoi/135918251)
1010

assets/multiqc_config.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ table_columns_visible:
162162
percentage_aligned: False
163163
MultiVCFAnalyzer:
164164
Heterozygous SNP alleles (percent): True
165-
custom_content:
165+
endorS.py :
166166
endogenous_dna: True
167167
endogenous_dna_post: True
168168
nuclear_contamination:
@@ -205,9 +205,10 @@ table_columns_placement:
205205
Samtools Flagstat (post-samtools filter):
206206
flagstat_total: 553
207207
mapped_passed: 554
208-
custom_content:
208+
endorS.py :
209209
endogenous_dna: 600
210210
endogenous_dna_post: 610
211+
nuclear_contamination:
211212
Num_SNPs: 1100
212213
Method1_MOM_estimate: 1110
213214
Method1_MOM_SE: 1120

docs/usage.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,8 @@ Defines the adapter sequence to be used for the reverse read in paired end seque
549549

550550
Defines the minimum read length that is required for reads after merging to be considered for downstream analysis after read merging. Default is `30`.
551551

552+
Note that performing read length filtering at this step is not reliable for correct endogenous DNA calculation, when you have a large percentage of very short reads in your library - such retrieved in single-stranded library protocols. When you have very few reads passing this length filter, it will artificially inflate your endogenous DNA by creating a very small denominator. In these cases it is recommended to set this to 0, and use `--bam_filter_minreadlength` to instead, to filter out 'unusuable' short reads after mapping.
553+
552554
#### `--clip_min_read_quality`
553555

554556
Defines the minimum read quality per base that is required for a base to be kept. Individual bases at the ends of reads falling below this threshold will be clipped off. Default is set to `20`.
@@ -705,6 +707,12 @@ Note that in all cases, if `--bam_mapping_quality_threshold` is also supplied, m
705707

706708
Specify a mapping quality threshold for mapped reads to be kept for downstream analysis. By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).
707709

710+
#### `bam_filter_minreadlength`
711+
712+
Specify minimum length of mapped reads. This filtering will apply at the same time as mapping quality filtering.
713+
714+
If used _instead_ of minimum length read filtering at AdapterRemoval, this can be useful to get more realistic endogenous DNA percentages, when most of your reads are very short (e.g. in single-stranded libraries) and would otherwise be discarded by AdapterRemoval (thus making an artifically small denominator for a typical endogenous DNA calculation). Note in this context you should not perform mapping quality filtering nor discarding of unmapped reads to ensure a correct 'denominator' of 'all reads', for the Endogenous DNA calculation.
715+
708716
### Read DeDuplication Parameters
709717

710718
If using TSV input, deduplication is performed library, i.e. after lane merging.

main.nf

Lines changed: 84 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,8 @@ def helpMessage() {
105105
--run_bam_filtering Turn on samtools filter for mapping quality or unmapped reads of BAM files.
106106
--bam_mapping_quality_threshold Minimum mapping quality for reads filter. Default: ${params.bam_mapping_quality_threshold}
107107
--bam_unmapped_type Defines whether to discard all unmapped reads, keep both mapped and unmapped together, or save as bam and/or only fastq format Options: 'discard', 'bam', 'keep', 'fastq', 'both'. Default: ${params.bam_unmapped_type}
108-
108+
--bam_filter_minreadlength Specify minimum read length to be kept after mapping.
109+
109110
DeDuplication
110111
--dedupper Deduplication method to use. Options: 'dedup', 'markduplicates'. Default: '${params.dedupper}'
111112
--dedup_all_merged Turn on treating all reads as merged reads.
@@ -1721,40 +1722,93 @@ process samtools_filter {
17211722
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.fastq.gz") optional true into ch_bam_filtering_for_metagenomic
17221723
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.bam") optional true
17231724

1724-
script:
1725-
def size = params.large_ref ? '-c' : ''
1725+
// Using shell block rather than script because we are playing with awk
1726+
shell:
1727+
size = !{params.large_ref} ? '-c' : ''
17261728

17271729
if ( "${params.bam_unmapped_type}" == "keep" ) {
1728-
"""
1729-
samtools view -h -b $bam -@ ${task.cpus} -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
1730-
samtools index ${libraryid}.filtered.bam ${size}
1731-
"""
1730+
'''
1731+
## Unmapped and MAPQ filtering
1732+
samtools view -h -b !{bam} -@ !{task.cpus} -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam
1733+
1734+
## Mapped LEN filtering
1735+
if [[ !{minlength} -eq 0 ]]; then
1736+
mv tmp_mapped.bam !{libraryid}.filtered.bam
1737+
else
1738+
## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
1739+
samtools view -h tmp_mapped.bam | awk 'length($10) >= !{minlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
1740+
fi
1741+
1742+
samtools index !{libraryid}.filtered.bam !{size}
1743+
'''
17321744
} else if("${params.bam_unmapped_type}" == "discard"){
1733-
"""
1734-
samtools view -h -b $bam -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
1735-
samtools index ${libraryid}.filtered.bam ${size}
1736-
"""
1745+
'''
1746+
## Unmapped and MAPQ filtering
1747+
samtools view -h -b !{bam} -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam
1748+
1749+
## Mapped LEN filtering
1750+
if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then
1751+
mv tmp_mapped.bam !{libraryid}.filtered.bam
1752+
else
1753+
## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
1754+
samtools view -h tmp_mapped.bam | awk 'length($10) >= !{params.bam_filter_minreadlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
1755+
fi
1756+
1757+
samtools index !{libraryid}.filtered.bam !{size}
1758+
'''
17371759
} else if("${params.bam_unmapped_type}" == "bam"){
1738-
"""
1739-
samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -o ${libraryid}.unmapped.bam
1740-
samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
1741-
samtools index ${libraryid}.filtered.bam ${size}
1742-
"""
1760+
'''
1761+
## Unmapped and MAPQ filtering
1762+
samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam
1763+
samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam
1764+
1765+
## Mapped LEN filtering
1766+
if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then
1767+
mv tmp_mapped.bam !{libraryid}.filtered.bam
1768+
else
1769+
## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
1770+
samtools view -h tmp_mapped.bam | awk 'length($10) >= !{params.bam_filter_minreadlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
1771+
fi
1772+
1773+
samtools index !{libraryid}.filtered.bam !{size}
1774+
'''
17431775
} else if("${params.bam_unmapped_type}" == "fastq"){
1744-
"""
1745-
samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -o ${libraryid}.unmapped.bam
1746-
samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
1747-
samtools index ${libraryid}.filtered.bam ${size}
1748-
samtools fastq -tn ${libraryid}.unmapped.bam | pigz -p ${task.cpus} > ${libraryid}.unmapped.fastq.gz
1749-
rm ${libraryid}.unmapped.bam
1750-
"""
1776+
'''
1777+
## Unmapped and MAPQ filtering
1778+
samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam
1779+
samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam
1780+
1781+
## Mapped LEN filtering
1782+
if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then
1783+
mv tmp_mapped.bam !{libraryid}.filtered.bam
1784+
else
1785+
## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
1786+
samtools view -h tmp_mapped.bam | awk 'length($10) >= !{params.bam_filter_minreadlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
1787+
fi
1788+
1789+
samtools index !{libraryid}.filtered.bam !{size}
1790+
1791+
## FASTQ
1792+
samtools fastq -tn !{libraryid}.unmapped.bam | pigz -p !{task.cpus} > !{libraryid}.unmapped.fastq.gz
1793+
rm !{libraryid}.unmapped.bam
1794+
'''
17511795
} else if("${params.bam_unmapped_type}" == "both"){
1752-
"""
1753-
samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -o ${libraryid}.unmapped.bam
1754-
samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
1755-
samtools index ${libraryid}.filtered.bam ${size}
1756-
samtools fastq -tn ${libraryid}.unmapped.bam | pigz -p ${task.cpus} > ${libraryid}.unmapped.fastq.gz
1757-
"""
1796+
'''
1797+
## Unmapped and MAPQ filtering
1798+
samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam
1799+
samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam
1800+
1801+
## Mapped LEN filtering
1802+
if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then
1803+
mv tmp_mapped.bam !{libraryid}.filtered.bam
1804+
else
1805+
## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
1806+
samtools view -h tmp_mapped.bam | awk 'length($10) >= !{params.bam_filter_minreadlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
1807+
fi
1808+
1809+
samtools index !{libraryid}.filtered.bam !{size}
1810+
samtools fastq -tn !{libraryid}.unmapped.bam | pigz -p !{task.cpus} > !{libraryid}.unmapped.fastq.gz
1811+
'''
17581812
}
17591813
}
17601814

@@ -1795,6 +1849,7 @@ process samtools_flagstat_after_filter {
17951849
if (params.run_bam_filtering) {
17961850
ch_flagstat_for_endorspy
17971851
.join(ch_bam_filtered_flagstat_for_endorspy, by: [0,1,2,3,4,5,6])
1852+
.dump(tag: "Joined")
17981853
.set{ ch_allflagstats_for_endorspy }
17991854

18001855
} else {

nextflow.config

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ params {
8181
run_bam_filtering = false
8282
bam_unmapped_type = 'discard'
8383
bam_mapping_quality_threshold = 0
84+
bam_filter_minreadlength = 0
8485

8586
//DamageProfiler settings
8687
damageprofiler_length = 100
@@ -339,7 +340,7 @@ manifest {
339340
homePage = 'https://github.com/nf-core/eager'
340341
description = 'A fully reproducible ancient and modern DNA pipeline in Nextflow and with cloud support.'
341342
mainScript = 'main.nf'
342-
nextflowVersion = '!>=19.10.0'
343+
nextflowVersion = '!>=20.04.0'
343344
version = '2.2.0dev'
344345
}
345346
// Function to ensure that resource requirements don't go beyond

0 commit comments

Comments
 (0)