Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
strategy:
matrix:
# Nextflow versions: check pipeline minimum and current latest
nxf_ver: ['19.10.0', '']
nxf_ver: ['20.04.0', '']
steps:
- name: Check out pipeline code
uses: actions/checkout@v2
Expand Down Expand Up @@ -106,7 +106,10 @@ jobs:
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --strip_input_fastq
- name: BAM_FILTERING Run basic mapping pipeline with mapping quality filtering, and unmapped export
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_mapping_quality_threshold 37 --bam_discard_unmapped --bam_unmapped_type 'fastq'
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_mapping_quality_threshold 37 --bam_unmapped_type 'fastq'
- name: BAM_FILTERING Run basic mapping pipeline with post-mapping length filtering
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --clip_readlength 0 --run_bam_filtering --bam_filter_minreadlength 50
- name: DEDUPLICATION Test with dedup
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --dedupper 'dedup' --dedup_all_merged
Expand Down Expand Up @@ -160,17 +163,17 @@ jobs:
for i in index0.idx ref.db ref.idx ref.inf table0.db table0.idx taxonomy.idx taxonomy.map taxonomy.tre; do wget https://github.com/nf-core/test-datasets/raw/eager/databases/malt/"$i" -P databases/malt/; done
- name: METAGENOMIC Run the basic pipeline but with unmapped reads going into MALT
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_discard_unmapped --bam_unmapped_type 'fastq' --run_metagenomic_screening --database "/home/runner/work/eager/eager/databases/malt/"
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --database "/home/runner/work/eager/eager/databases/malt/"
- name: MALTEXTRACT Download resource files
run: |
mkdir -p databases/maltextract
for i in ncbi.tre ncbi.map; do wget https://github.com/rhuebler/HOPS/raw/0.33/Resources/"$i" -P databases/maltextract/; done
- name: MALTEXTRACT Basic with MALT plus MaltExtract
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_discard_unmapped --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt'
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt" --run_maltextract --maltextract_ncbifiles "/home/runner/work/eager/eager/databases/maltextract/" --maltextract_taxon_list 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/maltextract/MaltExtract_list.txt'
- name: METAGENOMIC Run the basic pipeline but with unmapped reads going into Kraken
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering --bam_discard_unmapped --bam_unmapped_type 'fastq'
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering --bam_unmapped_type 'fastq'
- name: SEXDETERMINATION Run the basic pipeline with the bam input profile, but don't convert BAM, skip everything but sex determination
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --run_sexdeterrmine
Expand Down
7 changes: 4 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
* General documentation additions and cleaning, updated figures with CC-BY license
* Added large 'fullsize' dataset test-profiles for ancient fish, human, and a draft pathogen contexts.
* [#257](https://github.com/nf-core/eager/issues/257) Added the bowtie2 aligner as option for mapping, following Poullet and Orlando 2020 doi: [10.3389/fevo.2020.00105](https://doi.org/10.3389/fevo.2020.00105)
* [#451] Adds ANGSD genotype likelihood calculations as alternative to typical 'genotypers'
* [#504] Removed sexdeterrmine-snps plot from MultiQC report.
* [#451](https://github.com/nf-core/eager/issues/451) Adds ANGSD genotype likelihood calculations as alternative to typical 'genotypers'
* Nuclear contamination results are now shown in the MultiQC report.
* Tutorial on how to use profiles for reproducible science (i.e. parameter sharing between different groups)
* Added flexible trimming of bams by library type. 'half' and 'none' UDG libraries can now be trimmed differentially within a single eager run.
* [#522](https://github.com/nf-core/eager/issues/522) Added post-mapping length filter to asisst in more realistic endogenous DNA calculations
* [#512](https://github.com/nf-core/eager/issues/512) Added flexible trimming of bams by library type. 'half' and 'none' UDG libraries can now be trimmed differentially within a single eager run.

### `Fixed`

Expand All @@ -48,6 +48,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
* [#501](https://github.com/nf-core/eager/issues/501) - Adds additional validation checks for MALT/MaltExtract database input files
* [#508](https://github.com/nf-core/eager/issues/508) - Made Markduplicates default dedupper due to narrower context specificity of dedup
* [#516](https://github.com/nf-core/eager/issues/516) - Made bedtools not report out of memory exit code when warning of inconsistant FASTA/Bed entry names
* [#504](https://github.com/nf-core/eager/issues/504) - Removed uninformative sexdeterrmine-snps plot from MultiQC report.
* Nuclear contamination is now reported with the correct library names.

### `Dependencies`
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

![GitHub Actions CI Status](https://github.com/nf-core/eager/workflows/nf-core%20eager%20CI/badge.svg)
![GitHub Actions Linting Status](https://github.com/nf-core/eager/workflows/nf-core%20linting/badge.svg)
[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.10.0-brightgreen.svg)](https://www.nextflow.io/)
[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.04.0-brightgreen.svg)](https://www.nextflow.io/)
[![nf-core](https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg)](https://nf-co.re/)
[![DOI](https://zenodo.org/badge/135918251.svg)](https://zenodo.org/badge/latestdoi/135918251)

Expand Down
5 changes: 3 additions & 2 deletions assets/multiqc_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ table_columns_visible:
percentage_aligned: False
MultiVCFAnalyzer:
Heterozygous SNP alleles (percent): True
custom_content:
endorS.py :
endogenous_dna: True
endogenous_dna_post: True
nuclear_contamination:
Expand Down Expand Up @@ -205,9 +205,10 @@ table_columns_placement:
Samtools Flagstat (post-samtools filter):
flagstat_total: 553
mapped_passed: 554
custom_content:
endorS.py :
endogenous_dna: 600
endogenous_dna_post: 610
nuclear_contamination:
Num_SNPs: 1100
Method1_MOM_estimate: 1110
Method1_MOM_SE: 1120
Expand Down
8 changes: 8 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,8 @@ Defines the adapter sequence to be used for the reverse read in paired end seque

Defines the minimum read length that is required for reads after merging to be considered for downstream analysis after read merging. Default is `30`.

Note that performing read length filtering at this step is not reliable for correct endogenous DNA calculation, when you have a large percentage of very short reads in your library. When you have very few reads passing this length filter, it will artificially inflate your endogenous DNA by creating a very small denominator. In these cases it is recommended to set this to 0, and use `--bam_filter_minreadlength` to instead, to filter out 'unusuable' short reads after mapping.

#### `--clip_min_read_quality`

Defines the minimum read quality per base that is required for a base to be kept. Individual bases at the ends of reads falling below this threshold will be clipped off. Default is set to `20`.
Expand Down Expand Up @@ -705,6 +707,12 @@ Note that in all cases, if `--bam_mapping_quality_threshold` is also supplied, m

Specify a mapping quality threshold for mapped reads to be kept for downstream analysis. By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).

#### `bam_filter_minreadlength`

Specify minimum length of mapped reads. This filtering will happy at the same time as mapping quality filtering.
Comment thread
jfy133 marked this conversation as resolved.
Outdated

If used _instead_ of minimum length read filtering at AdapterRemoval, this can be useful to get more realistic endogenous DNA percentages, when most of your reads are very short and would otherwise be discarded by AdapterRemoval (thus making an artifically small denominator for a typical endogenous DNA calculation). Note in this context you should not perform mapping quality filtering nor discarding of unmapped reads to ensure a correct 'denominator' of 'all reads', for the Endogenous DNA calculation.

### Read DeDuplication Parameters

If using TSV input, deduplication is performed library, i.e. after lane merging.
Expand Down
113 changes: 84 additions & 29 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ def helpMessage() {
--run_bam_filtering Turn on samtools filter for mapping quality or unmapped reads of BAM files.
--bam_mapping_quality_threshold Minimum mapping quality for reads filter. Default: ${params.bam_mapping_quality_threshold}
--bam_unmapped_type Defines whether to discard all unmapped reads, keep both mapped and unmapped together, or save as bam and/or only fastq format Options: 'discard', 'bam', 'keep', 'fastq', 'both'. Default: ${params.bam_unmapped_type}

--bam_filter_minreadlength Specify minimum read length to be kept after mapping.

DeDuplication
--dedupper Deduplication method to use. Options: 'dedup', 'markduplicates'. Default: '${params.dedupper}'
--dedup_all_merged Turn on treating all reads as merged reads.
Expand Down Expand Up @@ -1721,40 +1722,93 @@ process samtools_filter {
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.fastq.gz") optional true into ch_bam_filtering_for_metagenomic
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.bam") optional true

script:
def size = params.large_ref ? '-c' : ''
// Using shell block rather than script because we are playing with awk
shell:
size = !{params.large_ref} ? '-c' : ''

if ( "${params.bam_unmapped_type}" == "keep" ) {
"""
samtools view -h -b $bam -@ ${task.cpus} -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
samtools index ${libraryid}.filtered.bam ${size}
"""
'''
## Unmapped and MAPQ filtering
samtools view -h -b !{bam} -@ !{task.cpus} -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam

## Mapped LEN filtering
if [[ !{minlength} -eq 0 ]]; then
mv tmp_mapped.bam !{libraryid}.filtered.bam
else
## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
samtools view -h tmp_mapped.bam | awk 'length($10) >= !{minlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
fi

samtools index !{libraryid}.filtered.bam !{size}
'''
} else if("${params.bam_unmapped_type}" == "discard"){
"""
samtools view -h -b $bam -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
samtools index ${libraryid}.filtered.bam ${size}
"""
'''
## Unmapped and MAPQ filtering
samtools view -h -b !{bam} -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam

## Mapped LEN filtering
if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then
mv tmp_mapped.bam !{libraryid}.filtered.bam
else
## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
samtools view -h tmp_mapped.bam | awk 'length($10) >= !{params.bam_filter_minreadlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
fi

samtools index !{libraryid}.filtered.bam !{size}
'''
} else if("${params.bam_unmapped_type}" == "bam"){
"""
samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -o ${libraryid}.unmapped.bam
samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
samtools index ${libraryid}.filtered.bam ${size}
"""
'''
## Unmapped and MAPQ filtering
samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam
samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam

## Mapped LEN filtering
if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then
mv tmp_mapped.bam !{libraryid}.filtered.bam
else
## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
samtools view -h tmp_mapped.bam | awk 'length($10) >= !{params.bam_filter_minreadlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
fi

samtools index !{libraryid}.filtered.bam !{size}
'''
} else if("${params.bam_unmapped_type}" == "fastq"){
"""
samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -o ${libraryid}.unmapped.bam
samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
samtools index ${libraryid}.filtered.bam ${size}
samtools fastq -tn ${libraryid}.unmapped.bam | pigz -p ${task.cpus} > ${libraryid}.unmapped.fastq.gz
rm ${libraryid}.unmapped.bam
"""
'''
## Unmapped and MAPQ filtering
samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam
samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam

## Mapped LEN filtering
if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then
mv tmp_mapped.bam !{libraryid}.filtered.bam
else
## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
samtools view -h tmp_mapped.bam | awk 'length($10) >= !{params.bam_filter_minreadlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
fi

samtools index !{libraryid}.filtered.bam !{size}

## FASTQ
samtools fastq -tn !{libraryid}.unmapped.bam | pigz -p !{task.cpus} > !{libraryid}.unmapped.fastq.gz
rm !{libraryid}.unmapped.bam
'''
} else if("${params.bam_unmapped_type}" == "both"){
"""
samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -o ${libraryid}.unmapped.bam
samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam
samtools index ${libraryid}.filtered.bam ${size}
samtools fastq -tn ${libraryid}.unmapped.bam | pigz -p ${task.cpus} > ${libraryid}.unmapped.fastq.gz
"""
'''
## Unmapped and MAPQ filtering
samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam
samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam

## Mapped LEN filtering
if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then
mv tmp_mapped.bam !{libraryid}.filtered.bam
else
## From https://www.biostars.org/p/92889/#92908; note may not be optimal for un-merged reads (i.e. reads with long fragment lenths)
samtools view -h tmp_mapped.bam | awk 'length($10) >= !{params.bam_filter_minreadlength} || $1 ~ /^@/' | samtools view -bS - > !{libraryid}.filtered.bam
fi

samtools index !{libraryid}.filtered.bam !{size}
samtools fastq -tn !{libraryid}.unmapped.bam | pigz -p !{task.cpus} > !{libraryid}.unmapped.fastq.gz
'''
}
}

Expand Down Expand Up @@ -1795,6 +1849,7 @@ process samtools_flagstat_after_filter {
if (params.run_bam_filtering) {
ch_flagstat_for_endorspy
.join(ch_bam_filtered_flagstat_for_endorspy, by: [0,1,2,3,4,5,6])
.dump(tag: "Joined")
.set{ ch_allflagstats_for_endorspy }

} else {
Expand Down
3 changes: 2 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ params {
run_bam_filtering = false
bam_unmapped_type = 'discard'
bam_mapping_quality_threshold = 0
bam_filter_minreadlength = 0

//DamageProfiler settings
damageprofiler_length = 100
Expand Down Expand Up @@ -339,7 +340,7 @@ manifest {
homePage = 'https://github.com/nf-core/eager'
description = 'A fully reproducible ancient and modern DNA pipeline in Nextflow and with cloud support.'
mainScript = 'main.nf'
nextflowVersion = '!>=19.10.0'
nextflowVersion = '!>=20.04.0'
version = '2.2.0dev'
}
// Function to ensure that resource requirements don't go beyond
Expand Down