diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a8c26c622..ba4606c8d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,6 +92,9 @@ jobs: - name: SKIPPING Test checking all skip steps work i.e. input bam, skipping straight to genotyping run: | nextflow run ${GITHUB_WORKSPACE} "$TOWER" -name "$RUN_NAME-skipping_logic" -profile test_bam,docker --bam --single_end --skip_fastqc --skip_adapterremoval --skip_mapping --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_genotyping --genotyping_tool 'freebayes' + - name: PMDTOOLS/TRIMBAM Checks trim bam and pmdtools works + run: | + nextflow run ${GITHUB_WORKSPACE} "$TOWER" -name "$RUN_NAME-pmdtools_trimbam" -profile test,docker --paired_end --dedupper 'dedup' --run_trim_bam --bamutils_clip_left 2 --run_pmdtools --pmdtools_platypus --pmdtools_first #- name: TRIM_BAM/PMD/GENOTYPING_UG/MULTIVCFANALYZER Test running PMDTools, TrimBam, GATK UnifiedGenotyper and MultiVCFAnalyzer # run: | # nextflow run ${GITHUB_WORKSPACE} "$TOWER" -name "$RUN_NAME-pmd_trimbam_gatkUG_MVA" -profile test,docker --paired_end --dedupper 'dedup' --run_trim_bam --run_pmdtools --run_genotyping --genotyping_source 'trimmed' --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e297d9a9..61c76c6b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * [#286](https://github.com/nf-core/eager/issues/286) - Adds pipeline-specific profiles (loaded from nf-core configs) * [#310](https://github.com/nf-core/eager/issues/310) - Generalises base.config * [#326](https://github.com/nf-core/eager/pull/326) - Add Biopython and [xopen](https://github.com/marcelm/xopen/) dependencies +* [#349](https://github.com/nf-core/eager/issues/349) - Adds two extra pmdtools parameters --first and --platypus to the damage analysis * [#336](https://github.com/nf-core/eager/issues/336) - Change default Y-axis maximum value of DamageProfiler to 30% to match popular (but slower) mapDamage, and allow user to set their own value. * [#352](https://github.com/nf-core/eager/pull/352) - Add social preview image * [#355](https://github.com/nf-core/eager/pull/355) - Add Kraken2 metagenomics classifier diff --git a/docs/usage.md b/docs/usage.md index 4de682f7d..e5fa71d6e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -639,11 +639,11 @@ Specifies to run PMDTools for damage based read filtering and assessment of DNA #### `--udg` false -Defines whether Uracil-DNA glycosylase (UDG) treatment was used to repair DNA damage on the sequencing libraries. If set, the parameter is used by downstream tools such as PMDTools to estimate damage only on CpG sites that are left after such a treatment. +Defines whether any form of Uracil-DNA glycosylase (UDG) treatment was used to repair DNA damage on the sequencing libraries. If set, the parameter is used by downstream tools such as PMDTools to estimate damage only on CpG sites that are left after such a treatment. -#### `--pmd_udg_type` \`half` +#### `--pmdtools_udg_type` -If you have UDGhalf treated data (Rohland et al 2016), specify `'half'` as option to this parameter to use a different model for DNA damage assessment in PMDTools. Specify the parameter with `'full'` and the DNA damage assesment will use CpG context only. If you don't specify the parameter at all, the library will be treated as non UDG treated. +If you have UDGhalf treated data (Rohland et al 2016), specify `'half'` as option to this parameter to use a different model for DNA damage assessment in PMDTools. Specify the parameter with `'full'` and the DNA damage assessment will use CpG context only. If you don't specify the parameter at all, the library will be treated as non UDG treated. #### `--pmdtools_range` @@ -655,11 +655,19 @@ Specifies the PMDScore threshold to use in the pipeline when filtering BAM files #### `--pmdtools_reference_mask` -Can be used to set a reference genome mask for PMDTools. +Can be used to set a reference genome mask for PMDTools. Default: none #### `--pmdtools_max_reads` -The maximum number of reads used for damage assessment in PMDtools. Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. Note that a too low value can also obtain incorrect results. +The maximum number of reads used for damage assessment in PMDtools. Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. Note that a too low value can also obtain incorrect results. Default: 10000 + +#### `--pmdtools_first` + +Outputs the deamination rate at the first position only, but with a standard error. Default: off + +#### `--pmdtools_platypus` + +Specifies to analyse CpG dinucleotides separately. This can be useful for analysing UDG-treated mammalian nuclear DNA, as CpG-context cytosines are almost always methylated and ot affect by UDG treatment, and can allow you to visualise damage patterns on UDG treated data. Default: off ### BAM Trimming Parameters diff --git a/main.nf b/main.nf index a47fcf030..706ceedfa 100644 --- a/main.nf +++ b/main.nf @@ -107,12 +107,14 @@ def helpMessage() { --damageprofiler_threshold Specify number of bases to consider for damageProfiler (e.g. on damage plot). Default: 15 --damageprofiler_yaxis Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'. Default: 0.30 --run_pmdtools Turn on PMDtools - --udg_type Specify here if you have UDG half treated libraries, Set to 'half' in that case, or 'full' for UDG+. If not set, libraries are set to UDG-. + --pmdtools_udg_type Specify here if you have UDG-half or UDG-full treated libraries. Set to 'half' or 'full' respectively. If not set, libraries are set to no-UDG treatment. --pmdtools_range Specify range of bases for PMDTools --pmdtools_threshold Specify PMDScore threshold for PMDTools --pmdtools_reference_mask Specify a reference mask for PMDTools --pmdtools_max_reads Specify the max. number of reads to consider for metrics generation - + --pmdtools_first Specify to output the deamination rate at the first position only, but with a standard error. Default: Off + --pmdtools_platypus Specify to analyse CpG di-nucleotides separately as not affected by UDG treatment. Default: Off + Annotation Statistics --run_bedtools_coverage Turn on ability to calculate no. reads, depth and breadth coverage of features in reference --anno_file Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes @@ -141,10 +143,10 @@ def helpMessage() { --freebayes_g Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in --freebayes_C. Default: turned off. --freebayes_p Specify ploidy of sample in FreeBayes. Default: 2 (diploid). - Concensus Sequence Generation - --run_vcf2genome Turns on ability to create a concensus sequence FASTA file based on a UnifiedGenotyper VCF file and the original reference (only considers SNPs). - --vcf2genome_outfile Specify name of the output FASTA file containing the concensus sequence. Do not inclvcf2 Default: '' - --vcf2genome_header Specify the header name of the concensus sequence entry within the FASTA file. Default: '' + Consensus Sequence Generation + --run_vcf2genome Turns on ability to create a consensus sequence FASTA file based on a UnifiedGenotyper VCF file and the original reference (only considers SNPs). + --vcf2genome_outfile Specify name of the output FASTA file containing the consensus sequence. Default: '.fasta' + --vcf2genome_header Specify the header name of the consensus sequence entry within the FASTA file. Default: '' --vcf2genome_minc Minimum depth coverage required for a call to be included (else N will be called). Default: 5 --vcf2genome_minq Minimum genotyping quality of a call to be called. Else N will be called. Default: 30 --vcf2genome_minfreq Minimum fraction of reads supporting a call to be included. Else N will be called. Default: 0.8 @@ -1648,7 +1650,7 @@ process pmdtools { script: //Check which treatment for the libraries was used - def treatment = params.pmd_udg_type ? (params.pmd_udg_type =='half' ? '--UDGhalf' : '--CpG') : '--UDGminus' + def treatment = params.pmdtools_udg_type ? (params.pmdtools_udg_type =='half' ? '--UDGhalf' : '--CpG') : '--UDGminus' if(params.snpcapture){ snpcap = (params.pmdtools_reference_mask != '') ? "--refseq ${params.pmdtools_reference_mask}" : '' log.info"######No reference mask specified for PMDtools, therefore ignoring that for downstream analysis!" @@ -1657,11 +1659,13 @@ process pmdtools { } size = "${params.large_ref}" ? '-c' : '' prefix = "${bam.baseName}" + first = "${params.pmdtools_first}" ? '--first' : '' + platypus = "${params.pmdtools_platypus}" ? '--platypus' : '' """ #Run Filtering step samtools calmd -b $bam $fasta | samtools view -h - | pmdtools --threshold ${params.pmdtools_threshold} $treatment $snpcap --header | samtools view -@ ${task.cpus} -Sb - > "${bam.baseName}".pmd.bam #Run Calc Range step - samtools calmd -b $bam $fasta | samtools view -h - | pmdtools --deamination --range ${params.pmdtools_range} $treatment $snpcap -n ${params.pmdtools_max_reads} > "${bam.baseName}".cpg.range."${params.pmdtools_range}".txt + samtools calmd -b $bam $fasta | samtools view -h - | pmdtools --deamination --range ${params.pmdtools_range} $treatment $snpcap -n ${params.pmdtools_max_reads} > "${bam.baseName}".cpg.range."${params.pmdtools_range}".txt "${first}" "${platypus}" samtools index "${size}" ${prefix}.pmd.bam """ } @@ -1730,8 +1734,14 @@ if ( params.run_genotyping && params.genotyping_source == 'raw' ) { .into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes } ch_rmdupindex_for_skipdamagemanipulation .into { ch_damagemanipulationindex_for_skipgenotyping; ch_damagemanipulationindex_for_genotyping_hc; ch_damagemanipulationindex_for_genotyping_freebayes } +} else if ( !params.run_genotyping && params.run_trim_bam && params.run_pmdtools ) { + ch_rmdup_for_skipdamagemanipulation + .into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes } + ch_rmdupindex_for_skipdamagemanipulation + .into { ch_damagemanipulationindex_for_skipgenotyping; ch_damagemanipulationindex_for_genotyping_hc; ch_damagemanipulationindex_for_genotyping_freebayes } } + /* Step 12b: Genotyping - UG NB: GATK 3.5 is the last release with VCF output in "old" VCF format, not breaking MVA. Therefore we need it (for now at least until downstream tools can read proper 4.2 VCFs... ) diff --git a/nextflow.config b/nextflow.config index 11bd9159f..f23247f6a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -96,11 +96,13 @@ params { //PMDTools settings run_pmdtools = false - pmd_udg_type = 'half' + pmdtools_udg_type = 'half' pmdtools_range = 10 pmdtools_threshold = 3 pmdtools_reference_mask = '' pmdtools_max_reads = 10000 + pmdtools_first = false + pmdtools_platypus = false //bamUtils trimbam settings run_trim_bam = false