Skip to content

Commit 7e262c9

Browse files
authored
Merge branch 'dev' into circularmapper-filter-fix
2 parents cfe19dd + b760924 commit 7e262c9

10 files changed

Lines changed: 107 additions & 16 deletions

File tree

.github/workflows/ci.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,3 +186,6 @@ jobs:
186186
- name: MTNUCRATIO Run basic pipeline with bam input profile, but don't convert BAM, skip everything but nmtnucratio
187187
run: |
188188
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio
189+
- name: RESCALING Run basic pipeline with basic pipeline but with mapDamage rescaling of BAM files. Note this will be slow
190+
run: |
191+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'

CHANGELOG.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
77

88
### `Added`
99

10+
- [#583](https://github.com/nf-core/eager/issues/583) - mapDamage2 rescaling of BAM files to remove damage
11+
1012
### `Fixed`
1113

1214
- Removed leftover old DockerHub push CI commands.
13-
- [#627](https://github.com/nf-core/eager/issues/627) Added de Barros Damgaard citation to README
14-
- [#630](https://github.com/nf-core/eager/pull/630) Better handling of Qualimap memory requirements and error strategy.
15+
- [#627](https://github.com/nf-core/eager/issues/627) - Added de Barros Damgaard citation to README
16+
- [#630](https://github.com/nf-core/eager/pull/630) - Better handling of Qualimap memory requirements and error strategy.
17+
- Fixed some incomplete schema options to ensure users supply valid input values
1518
- [#638](https://github.com/nf-core/eager/issues/638#issuecomment-748877567) Fixed inverted circularfilter filtering (previously filtering would happen by default, not when requested by user as originally recorded in documentation)
1619

1720
### `Dependencies`

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ In addition, references of tools and data used in this pipeline are as follows:
236236
* **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923).
237237
* **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools)
238238
* **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git)
239+
* **mapDamage2** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics , 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193)
239240
240241
## Data References
241242

bin/scrape_software_versions.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@
3535
'VCF2genome':['v_vcf2genome.txt', r"VCF2Genome \(v. ([0-9].[0-9]+) "],
3636
'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"],
3737
'kraken':['v_kraken.txt', r"Kraken version (\S+)"],
38-
'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"]
38+
'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"],
39+
'mapDamage2':['v_mapdamage.txt',r"(\S+)"],
3940
}
4041

4142
results = OrderedDict()
@@ -55,7 +56,7 @@
5556
results['Qualimap'] = '<span style="color:#999999;\">N/A</span>'
5657
results['Preseq'] = '<span style="color:#999999;\">N/A</span>'
5758
results['GATK HaplotypeCaller'] = '<span style="color:#999999;\">N/A</span>'
58-
#results['GATK UnifiedGenotyper'] = '<span style="color:#999999;\">N/A</span>'
59+
results['GATK UnifiedGenotyper'] = '<span style="color:#999999;\">N/A</span>'
5960
results['freebayes'] = '<span style="color:#999999;\">N/A</span>'
6061
results['sequenceTools'] = '<span style="color:#999999;\">N/A</span>'
6162
results['VCF2genome'] = '<span style="color:#999999;\">N/A</span>'
@@ -71,6 +72,7 @@
7172
results['kraken'] = '<span style="color:#999999;\">N/A</span>'
7273
results['maltextract'] = '<span style="color:#999999;\">N/A</span>'
7374
results['eigenstrat_snp_coverage'] = '<span style="color:#999999;\">N/A</span>'
75+
results['mapDamage2'] = '<span style="color:#999999;\">N/A</span>'
7476

7577
# Search each file using its regex
7678
for k, v in regexes.items():

conf/test_resources.config

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,8 @@ process {
5151
time = { check_max( 10.m * task.attempt, 'time' ) }
5252
}
5353

54+
withName:'mapdamage_rescaling'{
55+
time = { check_max( 20.m * task.attempt, 'time' ) }
56+
}
57+
5458
}

docs/output.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
658658
- `damageprofiler/` - this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files.
659659
- `pmdtools/` - this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`.
660660
- `trimmed_bam/` - this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read).
661+
- `damage_rescaling/` - this contains rescaled BAM files from mapDamage2. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping.
661662
- `genotyping/` - this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics.
662663
- `multivcfanalyzer/` - this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files.
663664
- `sex_determination/` - this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer.

environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,4 @@ dependencies:
4747
- conda-forge::xopen=0.9.0
4848
- bioconda::bowtie2=2.4.1
4949
- bioconda::eigenstratdatabasetools=1.0.2
50+
- bioconda::mapdamage2=2.2.0

main.nf

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -116,12 +116,15 @@ def helpMessage() {
116116
--damageprofiler_length [num] Specify length filter for DamageProfiler. Default: ${params.damageprofiler_length}
117117
--damageprofiler_threshold [num] Specify number of bases of each read to consider for DamageProfiler calculations. Default: ${params.damageprofiler_threshold}
118118
--damageprofiler_yaxis [float] Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'. Default: ${params.damageprofiler_yaxis}
119+
--run_mapdamage_rescaling Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.
120+
--rescale_length_5p Length of read for mapDamage2 to rescale from 5p end. Default: ${params.rescale_length_5p}
121+
--rescale_length_3p Length of read for mapDamage2 to rescale from 5p end. Default: ${params.rescale_length_3p}
119122
--run_pmdtools [bool] Turn on PMDtools
120123
--pmdtools_range [num] Specify range of bases for PMDTools. Default: ${params.pmdtools_range}
121124
--pmdtools_threshold [num] Specify PMDScore threshold for PMDTools. Default: ${params.pmdtools_threshold}
122125
--pmdtools_reference_mask [file] Specify a path to reference mask for PMDTools.
123126
--pmdtools_max_reads [num] Specify the maximum number of reads to consider for metrics generation. Default: ${params.pmdtools_max_reads}
124-
127+
125128
Annotation Statistics
126129
--run_bedtools_coverage [bool] Turn on ability to calculate no. reads, depth and breadth coverage of features in reference.
127130
--anno_file [file] Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes.
@@ -137,7 +140,7 @@ def helpMessage() {
137140
Genotyping
138141
--run_genotyping [bool] Turn on genotyping of BAM files.
139142
--genotyping_tool [str] Specify which genotyper to use either GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'.
140-
--genotyping_source [str] Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed' or 'pmd'. Default: '${params.genotyping_source}'
143+
--genotyping_source [str] Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd', 'rescaled'. Default: '${params.genotyping_source}'
141144
--gatk_call_conf [num] Specify GATK phred-scaled confidence threshold. Default: ${params.gatk_call_conf}
142145
--gatk_ploidy [num] Specify GATK organism ploidy. Default: ${params.gatk_ploidy}
143146
--gatk_downsample [num] Maximum depth coverage allowed for genotyping before down-sampling is turned on. Default: ${params.gatk_downsample}
@@ -285,7 +288,7 @@ if("${params.fasta}".endsWith(".gz")){
285288
path zipped_fasta from file(params.fasta) // path doesn't like it if a string of an object is not prefaced with a root dir (/), so use file() to resolve string before parsing to `path`
286289

287290
output:
288-
path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd
291+
path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling
289292

290293
script:
291294
unzip = zipped_fasta.toString() - '.gz'
@@ -296,7 +299,7 @@ if("${params.fasta}".endsWith(".gz")){
296299
} else {
297300
fasta_for_indexing = Channel
298301
.fromPath("${params.fasta}", checkIfExists: true)
299-
.into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd }
302+
.into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling }
300303
}
301304

302305
// Check that fasta index file path ends in '.fai'
@@ -413,8 +416,13 @@ if (params.sexdeterrmine_bedfile == '') {
413416

414417
// Genotyping validation
415418
if (params.run_genotyping){
419+
420+
if (params.genotyping_source != 'raw' && params.genotyping_source != 'pmd' && params.genotyping_source != 'trimmed' && params.genotyping_source != 'rescaled' ) {
421+
exit 1, "[nf-core/eager] error: please specify a valid genotyping source. Options: 'raw', 'pmd', 'trimmed', 'rescaled'. Found parameter: --genotyping_source '${params.genotyping_source}'."
422+
}
423+
416424
if (params.genotyping_tool != 'ug' && params.genotyping_tool != 'hc' && params.genotyping_tool != 'freebayes' && params.genotyping_tool != 'pileupcaller' && params.genotyping_tool != 'angsd' ) {
417-
exit 1, "[nf-core/eager] error: please specify a genotyper. Options: 'ug', 'hc', 'freebayes', 'pileupcaller'. Found parameter: --genotyping_tool '${params.genotyping_tool}'."
425+
exit 1, "[nf-core/eager] error: please specify a valid genotyper. Options: 'ug', 'hc', 'freebayes', 'pileupcaller'. Found parameter: --genotyping_tool '${params.genotyping_tool}'."
418426
}
419427

420428
if (params.gatk_ug_out_mode != 'EMIT_VARIANTS_ONLY' && params.gatk_ug_out_mode != 'EMIT_ALL_CONFIDENT_SITES' && params.gatk_ug_out_mode != 'EMIT_ALL_SITES') {
@@ -2173,11 +2181,11 @@ process library_merge {
21732181
if (!params.skip_deduplication) {
21742182
ch_input_for_skiplibrarymerging.mix(ch_output_from_librarymerging)
21752183
.filter { it =~/.*_rmdup.bam/ }
2176-
.into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools }
2184+
.into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools; ch_rmdup_for_damagerescaling }
21772185

21782186
} else {
21792187
ch_input_for_skiplibrarymerging.mix(ch_output_from_librarymerging)
2180-
.into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools }
2188+
.into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools; ch_rmdup_for_damagerescaling }
21812189
}
21822190

21832191
//////////////////////////////////////////////////
@@ -2283,7 +2291,37 @@ process damageprofiler {
22832291
"""
22842292
}
22852293

2286-
// Optionally perform further aDNA evaluation or filtering for just reads with damage etc.
2294+
// Damage rescaling with mapDamage
2295+
2296+
process mapdamage_rescaling {
2297+
2298+
label 'sc_small'
2299+
tag "${libraryid}"
2300+
2301+
publishDir "${params.outdir}/damage_rescaling", mode: params.publish_dir_mode
2302+
2303+
when:
2304+
params.run_mapdamage_rescaling
2305+
2306+
input:
2307+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_damagerescaling
2308+
file fasta from ch_fasta_for_damagerescaling.collect()
2309+
2310+
output:
2311+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_rescaled.bam"), path("*rescaled.bam.{bai,csi}") into ch_output_from_damagerescaling
2312+
2313+
script:
2314+
def base = "${bam.baseName}"
2315+
def singlestranded = strandedness == "single" ? '--single-stranded' : ''
2316+
def size = params.large_ref ? '-c' : ''
2317+
"""
2318+
mapDamage -i ${bam} -r ${fasta} --rescale --rescale-out ${bam}_rescaled.bam --rescale-length-5p ${params.rescale_length_5p} --rescale-length-3p=${params.rescale_length_3p} ${singlestranded}
2319+
samtools index ${bam}_rescaled.bam ${size}
2320+
"""
2321+
2322+
}
2323+
2324+
// Optionally perform further aDNA evaluation or filtering for just reads with damage etc.
22872325

22882326
process pmdtools {
22892327
label 'mc_small'
@@ -2370,7 +2408,7 @@ process bam_trim {
23702408
"""
23712409
}
23722410

2373-
// Post trimming merging of libraries to single samples, except for SS/DS
2411+
// Post-trimming merging of libraries to single samples, except for SS/DS
23742412
// libraries as they should be genotyped separately, because we will assume
23752413
// that if trimming is turned on, 'lab-removed' libraries can be combined with
23762414
// merged with 'in-silico damage removed' libraries to improve genotyping
@@ -2465,12 +2503,19 @@ if ( params.run_genotyping && params.genotyping_source == 'raw' ) {
24652503
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
24662504

24672505
} else if ( params.run_genotyping && params.genotyping_source == "pmd" && !params.run_pmdtools ) {
2468-
exit 1, "[nf-core/eager] error: Cannot run genotyping with 'pmd' source without running pmtools (--run_pmdtools)! Please check input parameters."
2506+
exit 1, "[nf-core/eager] error: Cannot run genotyping with 'pmd' source without running pmdtools (--run_pmdtools)! Please check input parameters."
24692507

24702508
} else if ( params.run_genotyping && params.genotyping_source == "pmd" && params.run_pmdtools ) {
24712509
ch_output_from_pmdtools
24722510
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
24732511

2512+
} else if ( params.run_genotyping && params.genotyping_source == "rescaled" && params.run_mapdamage_rescaling) {
2513+
ch_output_from_damagerescaling
2514+
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
2515+
2516+
} else if ( params.run_genotyping && params.genotyping_source == "rescaled" && !params.run_mapdamage_rescaling) {
2517+
exit 1, "[nf-core/eager] error: Cannot run genotyping with 'rescaled' source without running damage rescaling (--run_damagescaling)! Please check input parameters."
2518+
24742519
} else if ( !params.run_genotyping && !params.run_trim_bam && !params.run_pmdtools ) {
24752520
ch_rmdup_for_skipdamagemanipulation
24762521
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
@@ -3166,6 +3211,7 @@ process get_software_versions {
31663211
pileupCaller --version &> v_sequencetools.txt 2>&1 || true
31673212
bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true
31683213
eigenstrat_snp_coverage --version | cut -d ' ' -f2 >v_eigenstrat_snp_coverage.txt || true
3214+
mapDamage2 --version > v_mapdamage.txt || true
31693215
31703216
scrape_software_versions.py &> software_versions_mqc.yaml
31713217
"""

nextflow.config

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,10 @@ params {
110110
pmdtools_reference_mask = ''
111111
pmdtools_max_reads = 10000
112112

113+
// mapDamage
114+
run_mapdamage_rescaling = false
115+
params.rescale_length_5p = 12
116+
params.rescale_length_3p = 12
113117

114118
//Bedtools settings
115119
run_bedtools_coverage = false

0 commit comments

Comments
 (0)