diff --git a/CHANGELOG.md b/CHANGELOG.md index 15f1acc0e..3b6fb3d80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,13 +13,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#836](https://github.com/nf-core/eager/issues/836) Remove deprecated parameters from test profiles - [#838](https://github.com/nf-core/eager/issues/836) Fix --snpcapture_bed files not being picked up by Nextflow - [#843](https://github.com/nf-core/eager/issues/843) Re-add direct piping of AdapterRemovalFixPrefix to pigz +- [#844](https://github.com/nf-core/eager/issues/844) Fixed reference masking prior to pmdtools - [#845](https://github.com/nf-core/eager/issues/845) Updates parameter documention to specify `-s` preseq parameter also applies to lc_extrap - [#851](https://github.com/nf-core/eager/issues/851) Fixes a file-name clash during additional_library_merge, post-BAM trimming of different UDG treated libraries of a sample -- Fix PMDtools reference mask not being picked up by Nextflow, and it's use being evaluated against --snpcapture_bed rather than --pmdtools_reference_mask - Renamed a range of MultiQC general stats table headers to improve clarity, documentation has been updated accordingly ### `Dependencies` +- [#829](https://github.com/nf-core/eager/issues/829) Bumped sequencetools: 1.4.0.5 -> 1.5.2 - Bumped MultiQC: 1.11 -> 1.12 (for run-time optimisation and tool citation information) ### `Deprecated` diff --git a/docs/output.md b/docs/output.md index d00dd69b8..804a52be5 100644 --- a/docs/output.md +++ b/docs/output.md @@ -675,6 +675,7 @@ This section gives a brief summary of where to look for what files for downstrea Each module has it's own output directory which sit alongside the `MultiQC/` directory from which you opened the report. * `reference_genome/`: this directory contains the indexing files of your input reference genome (i.e. the various `bwa` indices, a `samtools`' `.fai` file, and a picard `.dict`), if you used the `--saveReference` flag. + * When masking of the reference is requested prior to running pmdtools, an additional directory `reference_genome/masked_genome` will be found here, containing the masked reference. * `fastqc/`: this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval. * `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) `fastq` files. These you can use for downstream applications such as taxonomic binning for metagenomic studies. * `post_ar_fastq_trimmed`: this contains `fastq` files that have been additionally trimmed after AdapterRemoval (if turned on). These reads are usually that had internal barcodes, or damage that needed to be removed before mapping. diff --git a/environment.yml b/environment.yml index 03b2c6b4b..2a0230cdf 100644 --- a/environment.yml +++ b/environment.yml @@ -31,7 +31,7 @@ dependencies: - bioconda::bedtools=2.30.0 - conda-forge::libiconv=1.16 - conda-forge::pigz=2.6 - - bioconda::sequencetools=1.4.0.6 + - bioconda::sequencetools=1.5.2 - bioconda::preseq=3.1.2 - bioconda::fastp=0.20.1 - bioconda::bamutil=1.0.15 diff --git a/main.nf b/main.nf index f67247a05..8e3db7bca 100644 --- a/main.nf +++ b/main.nf @@ -185,7 +185,7 @@ if("${params.fasta}".endsWith(".gz")){ path zipped_fasta from file(params.fasta) // path doesn't like it if a string of an object is not prefaced with a root dir (/), so use file() to resolve string before parsing to `path` output: - path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling,ch_fasta_for_bcftools_stats + path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_unmasked_fasta_for_masking,ch_unmasked_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling,ch_fasta_for_bcftools_stats script: unzip = zipped_fasta.toString() - '.gz' @@ -196,7 +196,7 @@ if("${params.fasta}".endsWith(".gz")){ } else { fasta_for_indexing = Channel .fromPath("${params.fasta}", checkIfExists: true) - .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling;ch_fasta_for_bcftools_stats } + .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_unmasked_fasta_for_masking; ch_unmasked_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling;ch_fasta_for_bcftools_stats } } // Check that fasta index file path ends in '.fai' @@ -247,15 +247,16 @@ if ( !params.clip_adapters_list ) { } if ( params.snpcapture_bed ) { - Channel.fromPath(params.snpcapture_bed, checkIfExists: true).into { ch_snpcapture_bed; ch_snpcapture_bed_pmd } + Channel.fromPath(params.snpcapture_bed, checkIfExists: true).set { ch_snpcapture_bed } } else { - Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt").into { ch_snpcapture_bed; ch_snpcapture_bed_pmd } + Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt").set { ch_snpcapture_bed } } -if ( params.pmdtools_reference_mask ) { - ch_pmdtoolsmask = Channel.fromPath(params.pmdtools_reference_mask, checkIfExists: true) +// Set up channel with pmdtools reference mask bedfile +if (!params.pmdtools_reference_mask) { + ch_bedfile_for_reference_masking = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy.txt") } else { - ch_pmdtoolsmask = Channel.fromPath("$projectDir/assets/nf-core_eager_dummy2.txt") + ch_bedfile_for_reference_masking = Channel.fromPath(params.pmdtools_reference_mask, checkIfExists: true) } // SexDetermination channel set up and bedfile validation @@ -2136,6 +2137,34 @@ process mapdamage_rescaling { // Optionally perform further aDNA evaluation or filtering for just reads with damage etc. +process mask_reference_for_pmdtools { + label 'sc_tiny' + tag "${fasta}" + publishDir "${params.outdir}/reference_genome/masked_reference", mode: params.publish_dir_mode + + when: (params.pmdtools_reference_mask && params.run_pmdtools) + + input: + file fasta from ch_unmasked_fasta_for_masking + file bedfile from ch_bedfile_for_reference_masking + + output: + file "${fasta.baseName}_masked.fa" into ch_masked_fasta_for_pmdtools + + script: + log.info "[nf-core/eager]: Masking reference \'${fasta}\' at positions found in \'${bedfile}\'. Masked reference will be used for pmdtools." + """ + bedtools maskfasta -fi ${fasta} -bed ${bedfile} -fo ${fasta.baseName}_masked.fa + """ +} + +// If masking was requested, use masked reference for pmdtools, else use original reference +if (params.pmdtools_reference_mask) { + ch_masked_fasta_for_pmdtools.set{ch_fasta_for_pmdtools} +} else { + ch_unmasked_fasta_for_pmdtools.set{ch_fasta_for_pmdtools} +} + process pmdtools { label 'mc_medium' tag "${libraryid}" @@ -2146,8 +2175,6 @@ process pmdtools { input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_pmdtools file fasta from ch_fasta_for_pmdtools.collect() - path snpcapture_bed from ch_snpcapture_bed_pmd - path pmdtools_reference_mask from ch_pmdtoolsmask output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.pmd.bam"), path("*.pmd.bam.{bai,csi}") into ch_output_from_pmdtools @@ -2156,18 +2183,16 @@ process pmdtools { script: //Check which treatment for the libraries was used def treatment = udg ? (udg == 'half' ? '--UDGhalf' : '--CpG') : '--UDGminus' - def snpcap = snpcapture_bed.getName() != 'nf-core_eager_dummy.txt' ? "--refseq ${pmdtools_reference_mask} --basecomposition" : '' - if ( snpcapture_bed.getName() != 'nf-core_eager_dummy.txt' && !params.pmdtools_reference_mask ) { log.info "[nf-core/eager] warn: No reference mask specified for PMDtools, therefore ignoring that for downstream analysis!" } def size = params.large_ref ? '-c' : '' def platypus = params.pmdtools_platypus ? '--platypus' : '' """ #Run Filtering step - samtools calmd ${bam} ${fasta} | pmdtools --threshold ${params.pmdtools_threshold} ${treatment} ${snpcap} --header | samtools view -Sb - > "${libraryid}".pmd.bam + samtools calmd ${bam} ${fasta} | pmdtools --threshold ${params.pmdtools_threshold} ${treatment} --header | samtools view -Sb - > "${libraryid}".pmd.bam #Run Calc Range step ## To allow early shut off of pipe: https://github.com/nextflow-io/nextflow/issues/1564 trap 'if [[ \$? == 141 ]]; then echo "Shutting samtools early due to -n parameter" && samtools index ${libraryid}.pmd.bam ${size}; exit 0; fi' EXIT - samtools calmd ${bam} ${fasta} | pmdtools --deamination ${platypus} --range ${params.pmdtools_range} ${treatment} ${snpcap} -n ${params.pmdtools_max_reads} > "${libraryid}".cpg.range."${params.pmdtools_range}".txt + samtools calmd ${bam} ${fasta} | pmdtools --deamination ${platypus} --range ${params.pmdtools_range} ${treatment} -n ${params.pmdtools_max_reads} > "${libraryid}".cpg.range."${params.pmdtools_range}".txt samtools index ${libraryid}.pmd.bam ${size} """ diff --git a/nextflow_schema.json b/nextflow_schema.json index 743473454..ff998a4b7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -889,9 +889,9 @@ }, "pmdtools_reference_mask": { "type": "string", - "description": "Specify a path to reference mask for PMDTools.", + "description": "Specify a bedfile to be used to mask the reference fasta prior to running pmdtools.", "fa_icon": "fas fa-mask", - "help_text": "Can be used to set a path to a reference genome mask for PMDTools." + "help_text": "Activates masking of the reference fasta prior to running pmdtools. Positions that are in the provided bedfile will be replaced by Ns in the reference genome. This is useful for capture data, where you might not want the allele of a SNP to be counted as damage when it is a transition. Masking of the reference is done using `bedtools maskfasta`." }, "pmdtools_max_reads": { "type": "integer",