Skip to content

Commit 5b55c86

Browse files
committed
Add mapDamage rescaling
1 parent ea352d9 commit 5b55c86

10 files changed

Lines changed: 106 additions & 16 deletions

File tree

.github/workflows/ci.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,3 +186,6 @@ jobs:
186186
- name: MTNUCRATIO Run basic pipeline with bam input profile, but don't convert BAM, skip everything but nmtnucratio
187187
run: |
188188
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio
189+
- name: RESCALING Run basic pipeline with basic pipeline but with mapDamage rescaling of BAM files. Note this will be slow
190+
run: |
191+
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'

CHANGELOG.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
77

88
### `Added`
99

10+
- [#583](https://github.com/nf-core/eager/issues/583) - mapDamage2 rescaling of BAM files to remove damage
11+
1012
### `Fixed`
1113

1214
- Removed leftover old DockerHub push CI commands.
13-
- [#627](https://github.com/nf-core/eager/issues/627) Added de Barros Damgaard citation to README
14-
- [#630](https://github.com/nf-core/eager/pull/630) Better handling of Qualimap memory requirements and error strategy.
15+
- [#627](https://github.com/nf-core/eager/issues/627) - Added de Barros Damgaard citation to README
16+
- [#630](https://github.com/nf-core/eager/pull/630) - Better handling of Qualimap memory requirements and error strategy.
1517

1618
### `Dependencies`
1719

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ In addition, references of tools and data used in this pipeline are as follows:
236236
* **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923).
237237
* **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools)
238238
* **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git)
239+
* **mapDamage2** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics , 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193)
239240
240241
## Data References
241242

bin/scrape_software_versions.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@
3535
'VCF2genome':['v_vcf2genome.txt', r"VCF2Genome \(v. ([0-9].[0-9]+) "],
3636
'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"],
3737
'kraken':['v_kraken.txt', r"Kraken version (\S+)"],
38-
'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"]
38+
'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"],
39+
'mapDamage2':['v_mapdamage.txt',r"(\S+)"],
3940
}
4041

4142
results = OrderedDict()
@@ -55,7 +56,7 @@
5556
results['Qualimap'] = '<span style="color:#999999;\">N/A</span>'
5657
results['Preseq'] = '<span style="color:#999999;\">N/A</span>'
5758
results['GATK HaplotypeCaller'] = '<span style="color:#999999;\">N/A</span>'
58-
#results['GATK UnifiedGenotyper'] = '<span style="color:#999999;\">N/A</span>'
59+
results['GATK UnifiedGenotyper'] = '<span style="color:#999999;\">N/A</span>'
5960
results['freebayes'] = '<span style="color:#999999;\">N/A</span>'
6061
results['sequenceTools'] = '<span style="color:#999999;\">N/A</span>'
6162
results['VCF2genome'] = '<span style="color:#999999;\">N/A</span>'
@@ -71,6 +72,7 @@
7172
results['kraken'] = '<span style="color:#999999;\">N/A</span>'
7273
results['maltextract'] = '<span style="color:#999999;\">N/A</span>'
7374
results['eigenstrat_snp_coverage'] = '<span style="color:#999999;\">N/A</span>'
75+
results['mapDamage2'] = '<span style="color:#999999;\">N/A</span>'
7476

7577
# Search each file using its regex
7678
for k, v in regexes.items():

conf/test_resources.config

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,8 @@ process {
5151
time = { check_max( 10.m * task.attempt, 'time' ) }
5252
}
5353

54+
withName:'mapdamage_rescaling'{
55+
time = { check_max( 20.m * task.attempt, 'time' ) }
56+
}
57+
5458
}

docs/output.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
658658
- `damageprofiler/` - this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files.
659659
- `pmdtools/` - this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`.
660660
- `trimmed_bam/` - this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read).
661+
- `damage_rescaling/` - this contains rescaled BAM files from mapDamage2. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping.
661662
- `genotyping/` - this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics.
662663
- `multivcfanalyzer/` - this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files.
663664
- `sex_determination/` - this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer.

environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,4 @@ dependencies:
4747
- conda-forge::xopen=0.9.0
4848
- bioconda::bowtie2=2.4.1
4949
- bioconda::eigenstratdatabasetools=1.0.2
50+
- bioconda::mapdamage2=2.2.0

main.nf

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -116,12 +116,15 @@ def helpMessage() {
116116
--damageprofiler_length [num] Specify length filter for DamageProfiler. Default: ${params.damageprofiler_length}
117117
--damageprofiler_threshold [num] Specify number of bases of each read to consider for DamageProfiler calculations. Default: ${params.damageprofiler_threshold}
118118
--damageprofiler_yaxis [float] Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'. Default: ${params.damageprofiler_yaxis}
119+
--run_mapdamage_rescaling Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.
120+
--rescale_length_5p Length of read for mapDamage2 to rescale from 5p end. Default: ${params.rescale_length_5p}
121+
--rescale_length_3p Length of read for mapDamage2 to rescale from 5p end. Default: ${params.rescale_length_3p}
119122
--run_pmdtools [bool] Turn on PMDtools
120123
--pmdtools_range [num] Specify range of bases for PMDTools. Default: ${params.pmdtools_range}
121124
--pmdtools_threshold [num] Specify PMDScore threshold for PMDTools. Default: ${params.pmdtools_threshold}
122125
--pmdtools_reference_mask [file] Specify a path to reference mask for PMDTools.
123126
--pmdtools_max_reads [num] Specify the maximum number of reads to consider for metrics generation. Default: ${params.pmdtools_max_reads}
124-
127+
125128
Annotation Statistics
126129
--run_bedtools_coverage [bool] Turn on ability to calculate no. reads, depth and breadth coverage of features in reference.
127130
--anno_file [file] Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes.
@@ -137,7 +140,7 @@ def helpMessage() {
137140
Genotyping
138141
--run_genotyping [bool] Turn on genotyping of BAM files.
139142
--genotyping_tool [str] Specify which genotyper to use either GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'.
140-
--genotyping_source [str] Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed' or 'pmd'. Default: '${params.genotyping_source}'
143+
--genotyping_source [str] Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd', 'rescaled'. Default: '${params.genotyping_source}'
141144
--gatk_call_conf [num] Specify GATK phred-scaled confidence threshold. Default: ${params.gatk_call_conf}
142145
--gatk_ploidy [num] Specify GATK organism ploidy. Default: ${params.gatk_ploidy}
143146
--gatk_downsample [num] Maximum depth coverage allowed for genotyping before down-sampling is turned on. Default: ${params.gatk_downsample}
@@ -285,7 +288,7 @@ if("${params.fasta}".endsWith(".gz")){
285288
path zipped_fasta from file(params.fasta) // path doesn't like it if a string of an object is not prefaced with a root dir (/), so use file() to resolve string before parsing to `path`
286289

287290
output:
288-
path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd
291+
path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling
289292

290293
script:
291294
unzip = zipped_fasta.toString() - '.gz'
@@ -296,7 +299,7 @@ if("${params.fasta}".endsWith(".gz")){
296299
} else {
297300
fasta_for_indexing = Channel
298301
.fromPath("${params.fasta}", checkIfExists: true)
299-
.into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd }
302+
.into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling }
300303
}
301304

302305
// Check that fasta index file path ends in '.fai'
@@ -413,8 +416,13 @@ if (params.sexdeterrmine_bedfile == '') {
413416

414417
// Genotyping validation
415418
if (params.run_genotyping){
419+
420+
if (params.genotyping_source != 'raw' && params.genotyping_source != 'pmd' && params.genotyping_source != 'trimmed' && params.genotyping_source != 'rescaled' ) {
421+
exit 1, "[nf-core/eager] error: please specify a valid genotyping source. Options: 'raw', 'pmd', 'trimmed', 'rescaled'. Found parameter: --genotyping_source '${params.genotyping_source}'."
422+
}
423+
416424
if (params.genotyping_tool != 'ug' && params.genotyping_tool != 'hc' && params.genotyping_tool != 'freebayes' && params.genotyping_tool != 'pileupcaller' && params.genotyping_tool != 'angsd' ) {
417-
exit 1, "[nf-core/eager] error: please specify a genotyper. Options: 'ug', 'hc', 'freebayes', 'pileupcaller'. Found parameter: --genotyping_tool '${params.genotyping_tool}'."
425+
exit 1, "[nf-core/eager] error: please specify a valid genotyper. Options: 'ug', 'hc', 'freebayes', 'pileupcaller'. Found parameter: --genotyping_tool '${params.genotyping_tool}'."
418426
}
419427

420428
if (params.gatk_ug_out_mode != 'EMIT_VARIANTS_ONLY' && params.gatk_ug_out_mode != 'EMIT_ALL_CONFIDENT_SITES' && params.gatk_ug_out_mode != 'EMIT_ALL_SITES') {
@@ -2172,11 +2180,11 @@ process library_merge {
21722180
if (!params.skip_deduplication) {
21732181
ch_input_for_skiplibrarymerging.mix(ch_output_from_librarymerging)
21742182
.filter { it =~/.*_rmdup.bam/ }
2175-
.into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools }
2183+
.into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools; ch_rmdup_for_damagerescaling }
21762184

21772185
} else {
21782186
ch_input_for_skiplibrarymerging.mix(ch_output_from_librarymerging)
2179-
.into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools }
2187+
.into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools; ch_rmdup_for_damagerescaling }
21802188
}
21812189

21822190
//////////////////////////////////////////////////
@@ -2282,7 +2290,37 @@ process damageprofiler {
22822290
"""
22832291
}
22842292

2285-
// Optionally perform further aDNA evaluation or filtering for just reads with damage etc.
2293+
// Damage rescaling with mapDamage
2294+
2295+
process mapdamage_rescaling {
2296+
2297+
label 'sc_small'
2298+
tag "${libraryid}"
2299+
2300+
publishDir "${params.outdir}/damage_rescaling", mode: params.publish_dir_mode
2301+
2302+
when:
2303+
params.run_mapdamage_rescaling
2304+
2305+
input:
2306+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_damagerescaling
2307+
file fasta from ch_fasta_for_damagerescaling.collect()
2308+
2309+
output:
2310+
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_rescaled.bam"), path("*rescaled.bam.{bai,csi}") into ch_output_from_damagerescaling
2311+
2312+
script:
2313+
def base = "${bam.baseName}"
2314+
def singlestranded = strandedness == "single" ? '--single-stranded' : ''
2315+
def size = params.large_ref ? '-c' : ''
2316+
"""
2317+
mapDamage -i ${bam} -r ${fasta} --rescale --rescale-out ${bam}_rescaled.bam --rescale-length-5p ${params.rescale_length_5p} --rescale-length-3p=${params.rescale_length_3p} ${singlestranded}
2318+
samtools index ${bam}_rescaled.bam ${size}
2319+
"""
2320+
2321+
}
2322+
2323+
// Optionally perform further aDNA evaluation or filtering for just reads with damage etc.
22862324

22872325
process pmdtools {
22882326
label 'mc_small'
@@ -2369,7 +2407,7 @@ process bam_trim {
23692407
"""
23702408
}
23712409

2372-
// Post trimming merging of libraries to single samples, except for SS/DS
2410+
// Post-trimming merging of libraries to single samples, except for SS/DS
23732411
// libraries as they should be genotyped separately, because we will assume
23742412
// that if trimming is turned on, 'lab-removed' libraries can be combined with
23752413
// merged with 'in-silico damage removed' libraries to improve genotyping
@@ -2464,12 +2502,19 @@ if ( params.run_genotyping && params.genotyping_source == 'raw' ) {
24642502
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
24652503

24662504
} else if ( params.run_genotyping && params.genotyping_source == "pmd" && !params.run_pmdtools ) {
2467-
exit 1, "[nf-core/eager] error: Cannot run genotyping with 'pmd' source without running pmtools (--run_pmdtools)! Please check input parameters."
2505+
exit 1, "[nf-core/eager] error: Cannot run genotyping with 'pmd' source without running pmdtools (--run_pmdtools)! Please check input parameters."
24682506

24692507
} else if ( params.run_genotyping && params.genotyping_source == "pmd" && params.run_pmdtools ) {
24702508
ch_output_from_pmdtools
24712509
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
24722510

2511+
} else if ( params.run_genotyping && params.genotyping_source == "rescaled" && params.run_mapdamage_rescaling) {
2512+
ch_output_from_damagerescaling
2513+
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
2514+
2515+
} else if ( params.run_genotyping && params.genotyping_source == "rescaled" && !params.run_mapdamage_rescaling) {
2516+
exit 1, "[nf-core/eager] error: Cannot run genotyping with 'rescaled' source without running damage rescaling (--run_damagescaling)! Please check input parameters."
2517+
24732518
} else if ( !params.run_genotyping && !params.run_trim_bam && !params.run_pmdtools ) {
24742519
ch_rmdup_for_skipdamagemanipulation
24752520
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
@@ -3165,6 +3210,7 @@ process get_software_versions {
31653210
pileupCaller --version &> v_sequencetools.txt 2>&1 || true
31663211
bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true
31673212
eigenstrat_snp_coverage --version | cut -d ' ' -f2 >v_eigenstrat_snp_coverage.txt || true
3213+
mapDamage2 --version > v_mapdamage.txt || true
31683214
31693215
scrape_software_versions.py &> software_versions_mqc.yaml
31703216
"""

nextflow.config

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,10 @@ params {
110110
pmdtools_reference_mask = ''
111111
pmdtools_max_reads = 10000
112112

113+
// mapDamage
114+
run_mapdamage_rescaling = false
115+
params.rescale_length_5p = 12
116+
params.rescale_length_3p = 12
113117

114118
//Bedtools settings
115119
run_bedtools_coverage = false

0 commit comments

Comments
 (0)