nf-core · jfy133 · Feb 10, 2020 · Feb 10, 2020 · Feb 11, 2020 · Feb 13, 2020
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -92,6 +92,9 @@ jobs:
       - name: SKIPPING Test checking all skip steps work i.e. input bam, skipping straight to genotyping
         run: |
           nextflow run ${GITHUB_WORKSPACE} "$TOWER" -name "$RUN_NAME-skipping_logic" -profile test_bam,docker --bam --single_end --skip_fastqc --skip_adapterremoval --skip_mapping --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_genotyping --genotyping_tool 'freebayes'
+      - name: PMDTOOLS/TRIMBAM Checks trim bam and pmdtools works
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} "$TOWER" -name "$RUN_NAME-pmdtools_trimbam" -profile test,docker --paired_end  --dedupper 'dedup' --run_trim_bam --bamutils_clip_left 2 --run_pmdtools --pmdtools_platypus --pmdtools_first
       #- name: TRIM_BAM/PMD/GENOTYPING_UG/MULTIVCFANALYZER Test running PMDTools, TrimBam, GATK UnifiedGenotyper and MultiVCFAnalyzer
       #  run: |
       #    nextflow run ${GITHUB_WORKSPACE} "$TOWER" -name "$RUN_NAME-pmd_trimbam_gatkUG_MVA" -profile test,docker --paired_end  --dedupper 'dedup' --run_trim_bam --run_pmdtools --run_genotyping --genotyping_source 'trimmed' --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 * [#286](https://github.com/nf-core/eager/issues/286) - Adds pipeline-specific profiles (loaded from nf-core configs)
 * [#310](https://github.com/nf-core/eager/issues/310) - Generalises base.config
 * [#326](https://github.com/nf-core/eager/pull/326) - Add Biopython and [xopen](https://github.com/marcelm/xopen/) dependencies
+* [#349](https://github.com/nf-core/eager/issues/349) - Adds two extra pmdtools parameters --first and --platypus to the damage analysis
 * [#336](https://github.com/nf-core/eager/issues/336) - Change default Y-axis maximum value of DamageProfiler to 30% to match popular (but slower) mapDamage, and allow user to set their own value.
 * [#352](https://github.com/nf-core/eager/pull/352) - Add social preview image
 * [#355](https://github.com/nf-core/eager/pull/355) - Add Kraken2 metagenomics classifier

diff --git a/docs/usage.md b/docs/usage.md
@@ -639,11 +639,11 @@ Specifies to run PMDTools for damage based read filtering and assessment of DNA
 
 #### `--udg` false
 
-Defines whether Uracil-DNA glycosylase (UDG) treatment was used to repair DNA damage on the sequencing libraries. If set, the parameter is used by downstream tools such as PMDTools to estimate damage only on CpG sites that are left after such a treatment.
+Defines whether any form of Uracil-DNA glycosylase (UDG) treatment was used to repair DNA damage on the sequencing libraries. If set, the parameter is used by downstream tools such as PMDTools to estimate damage only on CpG sites that are left after such a treatment.
 
-#### `--pmd_udg_type` \`half`
+#### `--pmdtools_udg_type`
 
-If you have UDGhalf treated data (Rohland et al 2016), specify `'half'` as option to this parameter to use a different model for DNA damage assessment in PMDTools. Specify the parameter with `'full'` and the DNA damage assesment will use CpG context only. If you don't specify the parameter at all, the library will be treated as non UDG treated.
+If you have UDGhalf treated data (Rohland et al 2016), specify `'half'` as option to this parameter to use a different model for DNA damage assessment in PMDTools. Specify the parameter with `'full'` and the DNA damage assessment will use CpG context only. If you don't specify the parameter at all, the library will be treated as non UDG treated.
 
 #### `--pmdtools_range`
 
@@ -655,11 +655,19 @@ Specifies the PMDScore threshold to use in the pipeline when filtering BAM files
 
 #### `--pmdtools_reference_mask`
 
-Can be used to set a reference genome mask for PMDTools.
+Can be used to set a reference genome mask for PMDTools. Default: none
 
 #### `--pmdtools_max_reads`
 
-The maximum number of reads used for damage assessment in PMDtools. Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. Note that a too low value can also obtain incorrect results.
+The maximum number of reads used for damage assessment in PMDtools. Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. Note that a too low value can also obtain incorrect results. Default: 10000
+
+#### `--pmdtools_first`
+
+Outputs the deamination rate at the first position only, but with a standard error. Default: off
+
+#### `--pmdtools_platypus`
+
+Specifies to analyse CpG dinucleotides separately. This can be useful for analysing UDG-treated mammalian nuclear DNA, as CpG-context cytosines are almost always methylated and ot affect by UDG treatment, and can allow you to visualise damage patterns on UDG treated data. Default: off
 
 ### BAM Trimming Parameters
 

diff --git a/main.nf b/main.nf
@@ -107,12 +107,14 @@ def helpMessage() {
       --damageprofiler_threshold    Specify number of bases to consider for damageProfiler (e.g. on damage plot). Default: 15
       --damageprofiler_yaxis        Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'. Default: 0.30 
       --run_pmdtools                Turn on PMDtools
-      --udg_type                    Specify here if you have UDG half treated libraries, Set to 'half' in that case, or 'full' for UDG+. If not set, libraries are set to UDG-.
+      --pmdtools_udg_type                    Specify here if you have UDG-half or UDG-full treated libraries. Set to 'half' or 'full' respectively. If not set, libraries are set to no-UDG treatment.
       --pmdtools_range              Specify range of bases for PMDTools
       --pmdtools_threshold          Specify PMDScore threshold for PMDTools
       --pmdtools_reference_mask     Specify a reference mask for PMDTools
       --pmdtools_max_reads          Specify the max. number of reads to consider for metrics generation
-
+      --pmdtools_first              Specify to output the deamination rate at the first position only, but with a standard error. Default: Off
+      --pmdtools_platypus           Specify to analyse CpG di-nucleotides separately as not affected by UDG treatment. Default: Off
+
     Annotation Statistics
       --run_bedtools_coverage       Turn on ability to calculate no. reads, depth and breadth coverage of features in reference
       --anno_file                   Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes
@@ -141,10 +143,10 @@ def helpMessage() {
       --freebayes_g                   Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in --freebayes_C. Default: turned off.
       --freebayes_p                   Specify ploidy of sample in FreeBayes. Default: 2 (diploid).
 
-    Concensus Sequence Generation
-      --run_vcf2genome              Turns on ability to create a concensus sequence FASTA file based on a UnifiedGenotyper VCF file and the original reference (only considers SNPs).
-      --vcf2genome_outfile          Specify name of the output FASTA file containing the concensus sequence. Do not inclvcf2 Default: '<input_vcf>'
-      --vcf2genome_header           Specify the header name of the concensus sequence entry within the FASTA file. Default: '<input_vcf>'
+    Consensus Sequence Generation
+      --run_vcf2genome              Turns on ability to create a consensus sequence FASTA file based on a UnifiedGenotyper VCF file and the original reference (only considers SNPs).
+      --vcf2genome_outfile          Specify name of the output FASTA file containing the consensus sequence. Default: '<input_vcf>.fasta'
+      --vcf2genome_header           Specify the header name of the consensus sequence entry within the FASTA file. Default: '<input_vcf>'
       --vcf2genome_minc             Minimum depth coverage required for a call to be included (else N will be called). Default: 5
       --vcf2genome_minq             Minimum genotyping quality of a call to be called. Else N will be called. Default: 30
       --vcf2genome_minfreq          Minimum fraction of reads supporting a call to be included. Else N will be called. Default: 0.8
@@ -1648,7 +1650,7 @@ process pmdtools {
 
     script:
     //Check which treatment for the libraries was used
-    def treatment = params.pmd_udg_type ? (params.pmd_udg_type =='half' ? '--UDGhalf' : '--CpG') : '--UDGminus'
+    def treatment = params.pmdtools_udg_type ? (params.pmdtools_udg_type =='half' ? '--UDGhalf' : '--CpG') : '--UDGminus'
     if(params.snpcapture){
         snpcap = (params.pmdtools_reference_mask != '') ? "--refseq ${params.pmdtools_reference_mask}" : ''
         log.info"######No reference mask specified for PMDtools, therefore ignoring that for downstream analysis!"
@@ -1657,11 +1659,13 @@ process pmdtools {
     }
     size = "${params.large_ref}" ? '-c' : ''
     prefix = "${bam.baseName}"
+    first = "${params.pmdtools_first}" ? '--first' : ''
+    platypus = "${params.pmdtools_platypus}" ? '--platypus' : ''
     """
     #Run Filtering step 
     samtools calmd -b $bam $fasta | samtools view -h - | pmdtools --threshold ${params.pmdtools_threshold} $treatment $snpcap --header | samtools view -@ ${task.cpus} -Sb - > "${bam.baseName}".pmd.bam
     #Run Calc Range step
-    samtools calmd -b $bam $fasta | samtools view -h - | pmdtools --deamination --range ${params.pmdtools_range} $treatment $snpcap -n ${params.pmdtools_max_reads} > "${bam.baseName}".cpg.range."${params.pmdtools_range}".txt 
+    samtools calmd -b $bam $fasta | samtools view -h - | pmdtools --deamination --range ${params.pmdtools_range} $treatment $snpcap -n ${params.pmdtools_max_reads} > "${bam.baseName}".cpg.range."${params.pmdtools_range}".txt "${first}" "${platypus}"
     samtools index "${size}" ${prefix}.pmd.bam
     """
 }
@@ -1730,8 +1734,14 @@ if ( params.run_genotyping && params.genotyping_source == 'raw' ) {
         .into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes } 
     ch_rmdupindex_for_skipdamagemanipulation
         .into { ch_damagemanipulationindex_for_skipgenotyping; ch_damagemanipulationindex_for_genotyping_hc; ch_damagemanipulationindex_for_genotyping_freebayes }
+} else if ( !params.run_genotyping && params.run_trim_bam && params.run_pmdtools )  {
+    ch_rmdup_for_skipdamagemanipulation
+        .into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes } 
+    ch_rmdupindex_for_skipdamagemanipulation
+        .into { ch_damagemanipulationindex_for_skipgenotyping; ch_damagemanipulationindex_for_genotyping_hc; ch_damagemanipulationindex_for_genotyping_freebayes }
 }
 
+
 /*
  Step 12b: Genotyping - UG
  NB: GATK 3.5 is the last release with VCF output in "old" VCF format, not breaking MVA. Therefore we need it (for now at least until downstream tools can read proper 4.2 VCFs... )

diff --git a/nextflow.config b/nextflow.config
@@ -96,11 +96,13 @@ params {
 
   //PMDTools settings
   run_pmdtools = false
-  pmd_udg_type = 'half'
+  pmdtools_udg_type = 'half'
   pmdtools_range = 10
   pmdtools_threshold = 3
   pmdtools_reference_mask = ''
   pmdtools_max_reads = 10000
+  pmdtools_first = false
+  pmdtools_platypus = false
 
   //bamUtils trimbam settings
   run_trim_bam = false