diff --git a/assets/schema_input.json b/assets/schema_input.json index 0a1a6d4f6..a05958303 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -73,6 +73,18 @@ "type": "string", "meta": ["bam_reference_id"], "errorMessage": "A BAM reference ID (corresponding to what is supplied to `--fasta`) must always be provided when providing a BAM file." + }, + "vcf": { + "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.vcf\\.gz$", + "exists": true, + "errorMessage": "VCF files cannot contain any spaces and must have extension '.vcf.gz' and be gzip compressed." + }, + "vcf_reference_id": { + "type": "string", + "meta": ["vcf_reference_id"], + "errorMessage": "A VCF reference ID (corresponding to what is supplied to `--fasta`) must always be provided when providing a VCF file." } }, "required": [ @@ -90,18 +102,24 @@ }, { "required": ["bam"] + }, + { + "required": ["vcf"] } ], "dependentRequired": { "r2": ["r1"], "bam": ["bam_reference_id"], - "bam_reference_id": ["bam"] + "bam_reference_id": ["bam"], + "vcf": ["vcf_reference_id"], + "vcf_reference_id": ["vcf"] } }, "allOf": [ { "uniqueEntries": ["lane", "library_id"] }, { "uniqueEntries": "r1" }, { "uniqueEntries": "r2" }, - { "uniqueEntries": "bam" } + { "uniqueEntries": "bam" }, + { "uniqueEntries": "vcf" } ] } diff --git a/conf/modules.config b/conf/modules.config index 069ae1256..c771c6436 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1757,4 +1757,35 @@ process { ] ] } + + withName: UG_BGZIP { + tag = { "${meta.reference}|${meta.sample_id}" } + ext.args = "-d" + } + + withName: REF_MVA_GUNZIP { + tag = { "${meta.reference}" } + ext.args = "-d" + } + + withName: MULTIVCFANALYZER { + tag = { "${meta.reference}" } + ext.prefix = { "multivcfanalyzer_${meta.reference}" } + publishDir = [ + [ + // data: gzipped FASTA alignments + path: { "${params.outdir}/consensus_sequence/data/" }, + mode: params.publish_dir_mode, + enabled: true, + pattern: '*.fasta.gz' + ], + [ + // stats + path: { 
"${params.outdir}/consensus_sequence/stats/" }, + mode: params.publish_dir_mode, + enabled: true, + pattern: '*.{txt,tsv}' + ] + ] + } } diff --git a/main.nf b/main.nf index 5096b1a78..c482c845a 100644 --- a/main.nf +++ b/main.nf @@ -45,6 +45,7 @@ workflow NFCORE_EAGER { take: samplesheet_fastqs // channel: samplesheet read in from --input samplesheet_bams + samplesheet_vcfs main: @@ -53,7 +54,8 @@ workflow NFCORE_EAGER { // EAGER ( samplesheet_fastqs, - samplesheet_bams + samplesheet_bams, + samplesheet_vcfs ) emit: multiqc_report = EAGER.out.multiqc_report // channel: /path/to/multiqc_report.html @@ -88,6 +90,7 @@ workflow { NFCORE_EAGER ( PIPELINE_INITIALISATION.out.samplesheet_fastqs, PIPELINE_INITIALISATION.out.samplesheet_bams, + PIPELINE_INITIALISATION.out.samplesheet_vcfs ) // // SUBWORKFLOW: Run completion tasks diff --git a/modules.json b/modules.json index 9015e2ff9..01e81c966 100644 --- a/modules.json +++ b/modules.json @@ -8,142 +8,198 @@ "adapterremoval": { "branch": "master", "git_sha": "5add1e8e11af620c779462936ce8bbcc1abcef2d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "amps": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "angsd/contamination": { "branch": "master", "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", - "installed_by": ["bam_docounts_contamination_angsd"] + "installed_by": [ + "bam_docounts_contamination_angsd" + ] }, "angsd/docounts": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["bam_docounts_contamination_angsd"] + "installed_by": [ + "bam_docounts_contamination_angsd" + ] }, "angsd/gl": { "branch": "master", "git_sha": "c22aa6082716bd372cbb8f7ccf7c83220f180864", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bamutil/trimbam": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": 
["modules"] + "installed_by": [ + "modules" + ] }, "bbmap/bbduk": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/index": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bcftools/stats": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bedtools/coverage": { "branch": "master", "git_sha": "1d1cb7bfef6cf67fbc7faafa6992ad8bdc3045b3", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bedtools/maskfasta": { "branch": "master", "git_sha": "3b248b84694d1939ac4bb33df84bf6233a34d668", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bowtie2/align": { "branch": "master", "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bowtie2/build": { "branch": "master", "git_sha": "1fea64f5132a813ec97c1c6d3a74e0aee7142b6d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bwa/aln": { "branch": "master", "git_sha": "e0ff65e1fb313677de09f5f477ae3da30ce19b7b", - "installed_by": ["fastq_align_bwaaln"] + "installed_by": [ + "fastq_align_bwaaln" + ] }, "bwa/index": { "branch": "master", "git_sha": "e0ff65e1fb313677de09f5f477ae3da30ce19b7b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bwa/mem": { "branch": "master", "git_sha": "e0ff65e1fb313677de09f5f477ae3da30ce19b7b", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bwa/sampe": { "branch": "master", "git_sha": "e0ff65e1fb313677de09f5f477ae3da30ce19b7b", - "installed_by": ["fastq_align_bwaaln"] + "installed_by": [ + "fastq_align_bwaaln" + ] }, "bwa/samse": { "branch": "master", "git_sha": "e0ff65e1fb313677de09f5f477ae3da30ce19b7b", - "installed_by": 
["fastq_align_bwaaln"] + "installed_by": [ + "fastq_align_bwaaln" + ] }, "cat/cat": { "branch": "master", "git_sha": "81f27e75847087865299cc46605deb3b09b4e0a2", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "cat/fastq": { "branch": "master", "git_sha": "02fd5bd7275abad27aad32d5c852e0a9b1b98882", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "circularmapper/circulargenerator": { "branch": "master", "git_sha": "a7b0131370d9bc38076efad88773bca5537203d0", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "circularmapper/realignsamfile": { "branch": "master", "git_sha": "a7b0131370d9bc38076efad88773bca5537203d0", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "damageprofiler": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "dedup": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "eigenstratdatabasetools/eigenstratsnpcoverage": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "endorspy": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "falco": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastp": { "branch": "master", "git_sha": "003920c7f9a8ae19b69a97171922880220bedf56", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqc": { "branch": "master", @@ -153,17 +209,23 @@ "freebayes": { "branch": "master", "git_sha": "77978839bef6d437f21edb900b49bcbc04f9f735", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gatk/indelrealigner": { "branch": "master", 
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gatk/realignertargetcreator": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gatk/unifiedgenotyper": { "branch": "master", @@ -173,17 +235,23 @@ "gatk4/haplotypecaller": { "branch": "master", "git_sha": "d742e3143f2ccb8853c29b35cfcf50b5e5026980", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gunzip": { "branch": "master", "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "kraken2/kraken2": { "branch": "master", "git_sha": "653218e79ffa76fde20319e9062f8b8da5cf7555", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "krakenuniq/preloadedkrakenuniq": { "branch": "master", @@ -193,12 +261,16 @@ "malt/run": { "branch": "master", "git_sha": "75027bf77472b1f4fd2cdd7e46f83119dfb0f2c6", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "maltextract": { "branch": "master", "git_sha": "8840ece9ee7528480dec95796e017be02ada0dc0", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "mapad/index": { "branch": "master", @@ -218,142 +290,213 @@ "megan/rma2info": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "metaphlan/metaphlan": { "branch": "master", "git_sha": "1038d3de36263159b4138324a646105941ac271a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "mtnucratio": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "af27af1be706e6a2bb8fe454175b0cdf77f47b49", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "multivcfanalyzer": { + 
"branch": "master", + "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", + "installed_by": [ + "modules" + ] }, "picard/createsequencedictionary": { "branch": "master", "git_sha": "20b0918591d4ba20047d7e13e5094bcceba81447", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "picard/markduplicates": { "branch": "master", "git_sha": "ec833ac4c29db6005d18baccf3306f557c46b006", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "pmdtools/filter": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "preseq/ccurve": { "branch": "master", "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "preseq/lcextrap": { "branch": "master", "git_sha": "9a88058962c0ee1715f2ad0e017c37e0cd75e532", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "prinseqplusplus": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "qualimap/bamqc": { "branch": "master", "git_sha": "6b0e4fe14ca1b12e131f64608f0bbaf36fd11451", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/collatefastq": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/depth": { "branch": "master", "git_sha": "a1ffbc1fd87bd5a829e956cc26ec9cc53af3e817", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/faidx": { "branch": "master", "git_sha": "ce0b1aed7d504883061e748f492a31bf44c5777c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/fastq": { "branch": "master", "git_sha": "8d8f0ae52d6c9342bd41c33dda0b74b07e32153d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/flagstat": { "branch": "master", 
"git_sha": "8d8f0ae52d6c9342bd41c33dda0b74b07e32153d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/idxstats": { "branch": "master", "git_sha": "8d8f0ae52d6c9342bd41c33dda0b74b07e32153d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/index": { "branch": "master", "git_sha": "8d8f0ae52d6c9342bd41c33dda0b74b07e32153d", - "installed_by": ["bam_split_by_region", "fastq_align_bwaaln"] + "installed_by": [ + "bam_split_by_region", + "fastq_align_bwaaln" + ] }, "samtools/merge": { "branch": "master", "git_sha": "8d8f0ae52d6c9342bd41c33dda0b74b07e32153d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/mpileup": { "branch": "master", "git_sha": "ce0b1aed7d504883061e748f492a31bf44c5777c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/sort": { "branch": "master", "git_sha": "8d8f0ae52d6c9342bd41c33dda0b74b07e32153d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/view": { "branch": "master", "git_sha": "8d8f0ae52d6c9342bd41c33dda0b74b07e32153d", - "installed_by": ["bam_split_by_region"] + "installed_by": [ + "bam_split_by_region" + ] }, "seqkit/split2": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "sequencetools/pileupcaller": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "sexdeterrmine": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "tabix/bgzip": { + "branch": "master", + "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", + "installed_by": [ + "modules" + ] }, "taxpasta/merge": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + 
"modules" + ] }, "taxpasta/standardise": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "untar": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -362,17 +505,23 @@ "bam_docounts_contamination_angsd": { "branch": "master", "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "bam_split_by_region": { "branch": "master", "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "fastq_align_bwaaln": { "branch": "master", "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nextflow_pipeline": { "branch": "master", @@ -393,4 +542,4 @@ } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/multivcfanalyzer/environment.yml b/modules/nf-core/multivcfanalyzer/environment.yml new file mode 100644 index 000000000..add0b883b --- /dev/null +++ b/modules/nf-core/multivcfanalyzer/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::multivcfanalyzer=0.85.2 diff --git a/modules/nf-core/multivcfanalyzer/main.nf b/modules/nf-core/multivcfanalyzer/main.nf new file mode 100644 index 000000000..f3a91a8a8 --- /dev/null +++ b/modules/nf-core/multivcfanalyzer/main.nf @@ -0,0 +1,100 @@ +process MULTIVCFANALYZER { + tag "$fasta" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/multivcfanalyzer:0.85.2--hdfd78af_1': + 'biocontainers/multivcfanalyzer:0.85.2--hdfd78af_1' }" + + input: + tuple val(meta), path(vcfs) + tuple val(meta2), path(fasta) + tuple val(meta3), path(snpeff_results) + tuple val(meta4), path(gff) + val allele_freqs + val genotype_quality + val coverage + val homozygous_freq + val heterozygous_freq + tuple val(meta5), path(gff_exclude) + + + output: + tuple val(meta), path('fullAlignment.fasta.gz') , emit: full_alignment + tuple val(meta), path('info.txt') , emit: info_txt + tuple val(meta), path('snpAlignment.fasta.gz') , emit: snp_alignment + tuple val(meta), path('snpAlignmentIncludingRefGenome.fasta.gz') , emit: snp_genome_alignment + tuple val(meta), path('snpStatistics.tsv') , emit: snpstatistics + tuple val(meta), path('snpTable.tsv') , emit: snptable + tuple val(meta), path('snpTableForSnpEff.tsv') , emit: snptable_snpeff + tuple val(meta), path('snpTableWithUncertaintyCalls.tsv') , emit: snptable_uncertainty + tuple val(meta), path('structureGenotypes.tsv') , emit: structure_genotypes + tuple val(meta), path('structureGenotypes_noMissingData-Columns.tsv') , emit: structure_genotypes_nomissing + tuple val(meta), path('MultiVCFAnalyzer.json') , emit: json + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // def args = task.ext.args ?: '' // MultiVCFAnalyzer has strict input ordering and all inputs are mandatory. Deactivating $args to prevent breakage of input + def args2 = task.ext.args2 ?: '' + + def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA" + def cmd_gff = gff ? "${gff}" : "NA" + def cmd_allele_freqs = allele_freqs ? "T" : "F" + def cmd_gff_exclude = gff_exclude ? "${gff_exclude}" : "NA" + + """ + multivcfanalyzer \\ + ${cmd_snpeff_results} \\ + ${fasta} \\ + ${cmd_gff} \\ + . 
\ + ${cmd_allele_freqs} \\ + ${genotype_quality} \\ + ${coverage} \\ + ${homozygous_freq} \\ + ${heterozygous_freq} \\ + ${cmd_gff_exclude} \\ + ${vcfs.sort().join(" ")} + + gzip \\ + $args2 \\ + fullAlignment.fasta snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' ) + END_VERSIONS + """ + stub: + + def args2 = task.ext.args2 ?: '' + + def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA" + def cmd_gff = gff ? "${gff}" : "NA" + def cmd_allele_freqs = allele_freqs ? "T" : "F" + def cmd_gff_exclude = gff_exclude ? "${gff_exclude}" : "NA" + + """ + echo "" | gzip > fullAlignment.fasta.gz + touch info.txt + echo "" | gzip > snpAlignment.fasta.gz + echo "" | gzip > snpAlignmentIncludingRefGenome.fasta.gz + touch snpStatistics.tsv + touch snpTable.tsv + touch snpTableForSnpEff.tsv + touch snpTableWithUncertaintyCalls.tsv + touch structureGenotypes.tsv + touch structureGenotypes_noMissingData-Columns.tsv + touch MultiVCFAnalyzer.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' ) + END_VERSIONS + + """ +} diff --git a/modules/nf-core/multivcfanalyzer/meta.yml b/modules/nf-core/multivcfanalyzer/meta.yml new file mode 100644 index 000000000..6c1a8b6f0 --- /dev/null +++ b/modules/nf-core/multivcfanalyzer/meta.yml @@ -0,0 +1,204 @@ +name: "multivcfanalyzer" +description: SNP table generator from GATK UnifiedGenotyper with functionality geared + for aDNA +keywords: + - vcf + - ancient DNA + - aDNA + - SNP + - GATK UnifiedGenotyper + - SNP table +tools: + - "multivcfanalyzer": + description: "MultiVCFAnalyzer is a VCF file post-processing tool tailored for + aDNA. License on Github repository." 
+ homepage: "https://github.com/alexherbig/MultiVCFAnalyzer" + documentation: "https://github.com/alexherbig/MultiVCFAnalyzer" + tool_dev_url: "https://github.com/alexherbig/MultiVCFAnalyzer" + doi: "10.1038/nature13591" + licence: ["GPL >=3"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - vcfs: + type: file + description: One or a list of uncompressed VCF files + pattern: "*.vcf" + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - fasta: + type: file + description: Reference genome that the VCFs were generated against + pattern: "*.{fasta,fna,fa}" + - - meta3: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - snpeff_results: + type: file + description: Results from snpEff in txt format (Optional) + pattern: "*.txt" + - - meta4: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - gff: + type: file + description: GFF file corresponding to reference genome fasta (Optional) + pattern: "*.gff" + - - allele_freqs: + type: boolean + description: | + Whether to include, in the SNP table, the percentage of reads in which + a given allele is present. + - - genotype_quality: + type: integer + description: | + Minimum GATK genotyping quality threshold below which a SNP call + is 'discarded' + - - coverage: + type: integer + description: | + Minimum number of reads that a position must be covered by to be + reported + - - homozygous_freq: + type: integer + description: Fraction of reads a base must have to be called 'homozygous' + - - heterozygous_freq: + type: integer + description: | + Fraction of reads above which, if a call also falls below the + homozygous threshold, a base will be called 'heterozygous'. 
+ - - meta5: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - gff_exclude: + type: file + description: | + file listing positions that will be 'filtered' (i.e. ignored) + (Optional) + pattern: "*.vcf" +output: + - full_alignment: + - meta: + type: file + description: Fasta a fasta file of all positions contained in the VCF files + i.e. including ref calls + pattern: ".fasta.gz" + - fullAlignment.fasta.gz: + type: file + description: Fasta a fasta file of all positions contained in the VCF files + i.e. including ref calls + pattern: ".fasta.gz" + - info_txt: + - meta: + type: file + description: Information about the run + pattern: ".txt" + - info.txt: + type: file + description: Information about the run + pattern: ".txt" + - snp_alignment: + - meta: + type: file + description: A fasta file of just SNP positions with samples only + pattern: ".fasta.gz" + - snpAlignment.fasta.gz: + type: file + description: A fasta file of just SNP positions with samples only + pattern: ".fasta.gz" + - snp_genome_alignment: + - meta: + type: file + description: A fasta file of just SNP positions with reference genome + pattern: ".fasta.gz" + - snpAlignmentIncludingRefGenome.fasta.gz: + type: file + description: A fasta file of just SNP positions with reference genome + pattern: ".fasta.gz" + - snpstatistics: + - meta: + type: file + description: Some basic statistics about the SNP calls of each sample + pattern: ".tsv" + - snpStatistics.tsv: + type: file + description: Some basic statistics about the SNP calls of each sample + pattern: ".tsv" + - snptable: + - meta: + type: file + description: Basic SNP table of combined positions taken from each VCF file + pattern: ".tsv" + - snpTable.tsv: + type: file + description: Basic SNP table of combined positions taken from each VCF file + pattern: ".tsv" + - snptable_snpeff: + - meta: + type: file + description: Input file for SnpEff + pattern: ".tsv" + - 
snpTableForSnpEff.tsv: + type: file + description: Input file for SnpEff + pattern: ".tsv" + - snptable_uncertainty: + - meta: + type: file + description: Same as above, but with lower case characters indicating uncertain + calls + pattern: ".tsv" + - snpTableWithUncertaintyCalls.tsv: + type: file + description: Same as above, but with lower case characters indicating uncertain + calls + pattern: ".tsv" + - structure_genotypes: + - meta: + type: file + description: Input file for STRUCTURE + pattern: ".tsv" + - structureGenotypes.tsv: + type: file + description: Input file for STRUCTURE + pattern: ".tsv" + - structure_genotypes_nomissing: + - meta: + type: file + description: Alternate input file for STRUCTURE + pattern: ".tsv" + - structureGenotypes_noMissingData-Columns.tsv: + type: file + description: Alternate input file for STRUCTURE + pattern: ".tsv" + - json: + - meta: + type: file + description: Summary statistics in MultiQC JSON format + pattern: ".json" + - MultiVCFAnalyzer.json: + type: file + description: Summary statistics in MultiQC JSON format + pattern: ".json" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/multivcfanalyzer/tests/main.nf.test b/modules/nf-core/multivcfanalyzer/tests/main.nf.test new file mode 100644 index 000000000..cbf037bd0 --- /dev/null +++ b/modules/nf-core/multivcfanalyzer/tests/main.nf.test @@ -0,0 +1,168 @@ +nextflow_process { + + name "Test Process MULTIVCFANALYZER" + script "../main.nf" + process "MULTIVCFANALYZER" + + tag "modules" + tag "modules_nfcore" + tag "multivcfanalyzer" + tag "gunzip" + tag "gatk/unifiedgenotyper" + + test("sarscov2 - vcf") { + + setup { + run("GATK_UNIFIEDGENOTYPER") { + script "../../gatk/unifiedgenotyper/main.nf" + process{ + """ + input[0] = Channel.of([ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ], + [ [ id:'test2' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam.bai', checkIfExists: true) + ] + ) + input[1] = [ [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = [ [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) + ] + input[3] = [ [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.dict', checkIfExists: true) + ] + input[4] = [[],[]] + input[5] = [[],[]] + input[6] = [[],[]] + input[7] = [[],[]] + """ + } + } + + run("GUNZIP") { + script "../../gunzip/main.nf" + process{ + """ + input[0] = GATK_UNIFIEDGENOTYPER.out.vcf + """ + } + } + } + + when { + process { + """ + input[0] = GUNZIP.out.gunzip.collect{ meta, vcf -> vcf }.map{ vcf -> [[ id: 'testVCF'], vcf]} + input[1] = [ [] , + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = [[],[]] + input[3] = [[],[]] + input[4] = true + input[5] = 30 + input[6] = 5 + input[7] = 0.8 + input[8] = 0.2 + input[9] = [[],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.snptable_snpeff, + process.out.json, + process.out.versions, + file(process.out.info_txt[0][1]).readLines().any { it.contains('Run finished:') }, + file(process.out.full_alignment[0][1]).readLines().any { it.contains('>') }, + file(process.out.snp_alignment[0][1]).readLines().any { it.contains('>') }, + file(process.out.snp_genome_alignment[0][1]).readLines().any { 
it.contains('>Reference') }, + file(process.out.snpstatistics[0][1]).readLines().any { it.contains('SNP statistics for') }, + file(process.out.snptable[0][1]).readLines().any { it.contains('Position') }, + file(process.out.snptable_uncertainty[0][1]).readLines().any { it.contains('Position') }, + file(process.out.structure_genotypes[0][1]).readLines().any { it.contains('-1') }, + file(process.out.structure_genotypes_nomissing[0][1]).readLines().any { it.contains('-1') }).match() + } + ) + } + } + + test("sarscov2 - vcf - stub") { + + options "-stub" + + setup { + run("GATK_UNIFIEDGENOTYPER", alias: "GATK_UNIFIEDGENOTYPER_STUB") { + script "../../gatk/unifiedgenotyper/main.nf" + + process{ + """ + input[0] = Channel.of([ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ], + [ [ id:'test2' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam.bai', checkIfExists: true) + ] + ) + input[1] = [ [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = [ [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) + ] + input[3] = [ [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.dict', checkIfExists: true) + ] + input[4] = [[],[]] + input[5] = [[],[]] + input[6] = [[],[]] + input[7] = [[],[]] + """ + } + } + + run("GUNZIP", alias: "GUNZIP_STUB") { + script "../../gunzip/main.nf" + process{ + """ + input[0] = GATK_UNIFIEDGENOTYPER_STUB.out.vcf + """ + } + } + } + + when { + process { + """ + input[0] = 
GUNZIP_STUB.out.gunzip.collect{ meta, vcf -> vcf }.map{ vcf -> [[ id: 'testVCF'], vcf]} + input[1] = [ [] , + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[2] = [[],[]] + input[3] = [[],[]] + input[4] = true + input[5] = 30 + input[6] = 5 + input[7] = 0.8 + input[8] = 0.2 + input[9] = [[],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/multivcfanalyzer/tests/main.nf.test.snap b/modules/nf-core/multivcfanalyzer/tests/main.nf.test.snap new file mode 100644 index 000000000..8745ba2ce --- /dev/null +++ b/modules/nf-core/multivcfanalyzer/tests/main.nf.test.snap @@ -0,0 +1,232 @@ +{ + "sarscov2 - vcf": { + "content": [ + [ + [ + { + "id": "testVCF" + }, + "snpTableForSnpEff.tsv:md5,8d7ab4ec98a89d290e301d6feae461aa" + ] + ], + [ + [ + { + "id": "testVCF" + }, + "MultiVCFAnalyzer.json:md5,c841c9f04c6114911f308ea09a08980e" + ] + ], + [ + "versions.yml:md5,f873a3a710189c18d247b603a6cbea1b" + ], + true, + true, + false, + false, + true, + true, + true, + true, + true + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.2" + }, + "timestamp": "2024-06-21T11:40:04.442639" + }, + "sarscov2 - vcf - stub": { + "content": [ + { + "0": [ + [ + { + "id": "testVCF" + }, + "fullAlignment.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "testVCF" + }, + "info.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "10": [ + [ + { + "id": "testVCF" + }, + "MultiVCFAnalyzer.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "11": [ + "versions.yml:md5,f873a3a710189c18d247b603a6cbea1b" + ], + "2": [ + [ + { + "id": "testVCF" + }, + "snpAlignment.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "3": [ + [ + { + "id": "testVCF" + }, + "snpAlignmentIncludingRefGenome.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "4": [ + [ + { + "id": "testVCF" + }, + 
"snpStatistics.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "testVCF" + }, + "snpTable.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "testVCF" + }, + "snpTableForSnpEff.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "testVCF" + }, + "snpTableWithUncertaintyCalls.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "testVCF" + }, + "structureGenotypes.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "9": [ + [ + { + "id": "testVCF" + }, + "structureGenotypes_noMissingData-Columns.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "full_alignment": [ + [ + { + "id": "testVCF" + }, + "fullAlignment.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "info_txt": [ + [ + { + "id": "testVCF" + }, + "info.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "json": [ + [ + { + "id": "testVCF" + }, + "MultiVCFAnalyzer.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "snp_alignment": [ + [ + { + "id": "testVCF" + }, + "snpAlignment.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "snp_genome_alignment": [ + [ + { + "id": "testVCF" + }, + "snpAlignmentIncludingRefGenome.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "snpstatistics": [ + [ + { + "id": "testVCF" + }, + "snpStatistics.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "snptable": [ + [ + { + "id": "testVCF" + }, + "snpTable.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "snptable_snpeff": [ + [ + { + "id": "testVCF" + }, + "snpTableForSnpEff.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "snptable_uncertainty": [ + [ + { + "id": "testVCF" + }, + "snpTableWithUncertaintyCalls.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "structure_genotypes": [ + [ + { + "id": "testVCF" + }, + "structureGenotypes.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "structure_genotypes_nomissing": [ + [ + { + "id": "testVCF" + }, + 
"structureGenotypes_noMissingData-Columns.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,f873a3a710189c18d247b603a6cbea1b" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-16T12:13:06.080191151" + } +} \ No newline at end of file diff --git a/modules/nf-core/multivcfanalyzer/tests/tags.yml b/modules/nf-core/multivcfanalyzer/tests/tags.yml new file mode 100644 index 000000000..3ff5245f2 --- /dev/null +++ b/modules/nf-core/multivcfanalyzer/tests/tags.yml @@ -0,0 +1,2 @@ +multivcfanalyzer: + - "modules/nf-core/multivcfanalyzer/**" diff --git a/modules/nf-core/tabix/bgzip/environment.yml b/modules/nf-core/tabix/bgzip/environment.yml new file mode 100644 index 000000000..6221bb53a --- /dev/null +++ b/modules/nf-core/tabix/bgzip/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::htslib=1.21 + - bioconda::tabix=1.11 diff --git a/modules/nf-core/tabix/bgzip/main.nf b/modules/nf-core/tabix/bgzip/main.nf new file mode 100644 index 000000000..c7e7462f1 --- /dev/null +++ b/modules/nf-core/tabix/bgzip/main.nf @@ -0,0 +1,56 @@ +process TABIX_BGZIP { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/92/92859404d861ae01afb87e2b789aebc71c0ab546397af890c7df74e4ee22c8dd/data' : + 'community.wave.seqera.io/library/htslib:1.21--ff8e28a189fbecaa' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("${output}"), emit: output + tuple val(meta), path("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + in_bgzip = ["gz", "bgz", "bgzf"].contains(input.getExtension()) + extension = in_bgzip ? input.getBaseName().tokenize(".")[-1] : input.getExtension() + output = in_bgzip ? "${prefix}.${extension}" : "${prefix}.${extension}.gz" + command = in_bgzip ? '-d' : '' + // Name the index according to $prefix, unless a name has been requested + split_args = args.split(' +|=') + if ((split_args.contains('-i') || split_args.contains('--index')) && !split_args.contains('-I') && !split_args.contains('--index-name')) { + args = args + " -I ${output}.gzi" + } + """ + bgzip $command -c $args -@${task.cpus} $input > ${output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + in_bgzip = ["gz", "bgz", "bgzf"].contains(input.getExtension()) + output = in_bgzip ? 
input.getBaseName() : "${prefix}.${input.getExtension()}.gz" + + """ + echo "" | gzip > ${output} + touch ${output}.gzi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/tabix/bgzip/meta.yml b/modules/nf-core/tabix/bgzip/meta.yml new file mode 100644 index 000000000..7e44a0770 --- /dev/null +++ b/modules/nf-core/tabix/bgzip/meta.yml @@ -0,0 +1,61 @@ +name: tabix_bgzip +description: Compresses/decompresses files +keywords: + - compress + - decompress + - bgzip + - tabix +tools: + - bgzip: + description: | + Bgzip compresses or decompresses files in a similar manner to, and compatible with, gzip. + homepage: https://www.htslib.org/doc/tabix.html + documentation: http://www.htslib.org/doc/bgzip.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] + identifier: biotools:tabix +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: file to compress or to decompress +output: + - output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${output}: + type: file + description: Output compressed/decompressed file + pattern: "*." + - gzi: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - '*.gzi': + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@maxulysse" + - "@nvnieuwk" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@maxulysse" + - "@nvnieuwk" diff --git a/modules/nf-core/tabix/bgzip/tests/bgzip_compress.config b/modules/nf-core/tabix/bgzip/tests/bgzip_compress.config new file mode 100644 index 000000000..6b6ff55fe --- /dev/null +++ b/modules/nf-core/tabix/bgzip/tests/bgzip_compress.config @@ -0,0 +1,5 @@ +process { + withName: TABIX_BGZIP { + ext.args = ' -i' + } +} diff --git a/modules/nf-core/tabix/bgzip/tests/main.nf.test b/modules/nf-core/tabix/bgzip/tests/main.nf.test new file mode 100644 index 000000000..00e7c0984 --- /dev/null +++ b/modules/nf-core/tabix/bgzip/tests/main.nf.test @@ -0,0 +1,111 @@ +nextflow_process { + + name "Test Process TABIX_BGZIP" + script "../main.nf" + process "TABIX_BGZIP" + + tag "modules" + tag "modules_nfcore" + tag "tabix" + tag "tabix/bgzip" + + test("sarscov2_vcf_bgzip_compress") { + when { + process { + """ + input[0] = [ + [ id:'bgzip_test' ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot( + file(process.out.output[0][1]).name + ).match("bgzip_test") + } + ) + } + } + + test("homo_genome_bedgz_compress") { + when { + process { + """ + input[0] = [ + [ id:'bedgz_test' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.bed.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot( + file(process.out.output[0][1]).name 
+ ).match("bedgz_test") + } + ) + } + } + + test("sarscov2_vcf_bgzip_compress_stub") { + options '-stub' + config "./bgzip_compress.config" + + when { + process { + """ + input[0] = [ + [ id:"test_stub" ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot( + file(process.out.output[0][1]).name + ).match("test_stub") + } + ) + } + } + + test("sarscov2_vcf_bgzip_compress_gzi") { + config "./bgzip_compress.config" + when { + process { + """ + input[0] = [ + [ id:"gzi_compress_test" ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot( + file(process.out.gzi[0][1]).name + ).match("gzi_compress_test") + } + ) + } + } +} diff --git a/modules/nf-core/tabix/bgzip/tests/main.nf.test.snap b/modules/nf-core/tabix/bgzip/tests/main.nf.test.snap new file mode 100644 index 000000000..c605d54a2 --- /dev/null +++ b/modules/nf-core/tabix/bgzip/tests/main.nf.test.snap @@ -0,0 +1,218 @@ +{ + "gzi_compress_test": { + "content": [ + "gzi_compress_test.vcf.gz.gzi" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-02-19T14:52:29.328146" + }, + "homo_genome_bedgz_compress": { + "content": [ + { + "0": [ + [ + { + "id": "bedgz_test" + }, + "bedgz_test.bed:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,8721da9158d25c69b2215adf9cdc9fde" + ], + "gzi": [ + + ], + "output": [ + [ + { + "id": "bedgz_test" + }, + "bedgz_test.bed:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "versions": [ + "versions.yml:md5,8721da9158d25c69b2215adf9cdc9fde" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + 
"timestamp": "2025-03-26T13:52:19.285035543" + }, + "test_stub": { + "content": [ + "test_stub.vcf.gz" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-02-19T14:52:20.811489" + }, + "sarscov2_vcf_bgzip_compress": { + "content": [ + { + "0": [ + [ + { + "id": "bgzip_test" + }, + "bgzip_test.vcf.gz:md5,8e722884ffb75155212a3fc053918766" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,8721da9158d25c69b2215adf9cdc9fde" + ], + "gzi": [ + + ], + "output": [ + [ + { + "id": "bgzip_test" + }, + "bgzip_test.vcf.gz:md5,8e722884ffb75155212a3fc053918766" + ] + ], + "versions": [ + "versions.yml:md5,8721da9158d25c69b2215adf9cdc9fde" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-26T13:52:15.290470496" + }, + "sarscov2_vcf_bgzip_compress_gzi": { + "content": [ + { + "0": [ + [ + { + "id": "gzi_compress_test" + }, + "gzi_compress_test.vcf.gz:md5,8e722884ffb75155212a3fc053918766" + ] + ], + "1": [ + [ + { + "id": "gzi_compress_test" + }, + "gzi_compress_test.vcf.gz.gzi:md5,26fd00d4e26141cd11561f6e7d4a2ad0" + ] + ], + "2": [ + "versions.yml:md5,8721da9158d25c69b2215adf9cdc9fde" + ], + "gzi": [ + [ + { + "id": "gzi_compress_test" + }, + "gzi_compress_test.vcf.gz.gzi:md5,26fd00d4e26141cd11561f6e7d4a2ad0" + ] + ], + "output": [ + [ + { + "id": "gzi_compress_test" + }, + "gzi_compress_test.vcf.gz:md5,8e722884ffb75155212a3fc053918766" + ] + ], + "versions": [ + "versions.yml:md5,8721da9158d25c69b2215adf9cdc9fde" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-26T13:52:26.577148245" + }, + "bgzip_test": { + "content": [ + "bgzip_test.vcf.gz" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-02-19T14:52:03.768295" + }, + "bedgz_test": { + "content": [ + "bedgz_test.bed" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-02-19T14:52:12.453855" + }, + 
"sarscov2_vcf_bgzip_compress_stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_stub" + }, + "test_stub.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test_stub" + }, + "test_stub.vcf.gz.gzi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,8721da9158d25c69b2215adf9cdc9fde" + ], + "gzi": [ + [ + { + "id": "test_stub" + }, + "test_stub.vcf.gz.gzi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "output": [ + [ + { + "id": "test_stub" + }, + "test_stub.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,8721da9158d25c69b2215adf9cdc9fde" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-26T13:52:22.850987971" + } +} \ No newline at end of file diff --git a/modules/nf-core/tabix/bgzip/tests/vcf_none.config b/modules/nf-core/tabix/bgzip/tests/vcf_none.config new file mode 100644 index 000000000..f3a3c467d --- /dev/null +++ b/modules/nf-core/tabix/bgzip/tests/vcf_none.config @@ -0,0 +1,5 @@ +process { + withName: TABIX_BGZIP { + ext.args = '' + } +} diff --git a/nextflow.config b/nextflow.config index 91203cb88..6f408392f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -286,6 +286,19 @@ params { genotyping_freebayes_skip_coverage = 0 genotyping_angsd_glmodel = 'samtools' genotyping_angsd_glformat = 'binary' + + //Consensus sequence + run_consensus_sequence = false + consensus_tool = null + consensus_multivcfanalyzer_write_allele_frequencies = false + consensus_multivcfanalyzer_min_genotype_quality = 30 + consensus_multivcfanalyzer_min_base_coverage = 5 + consensus_multivcfanalyzer_allele_freq_hom = 0.9 + consensus_multivcfanalyzer_allele_freq_het = 0.9 + consensus_multivcfanalyzer_additional_vcf_files = null + consensus_multivcfanalyzer_reference_gff_annotations = null + consensus_multivcfanalyzer_reference_gff_exclude = null + consensus_multivcfanalyzer_snpeff_results = null } // Load base.config by default for 
all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index a72b94ef9..165e66901 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -346,7 +346,7 @@ "type": "boolean", "description": "Specify to skip read-pair merging.", "fa_icon": "fas fa-forward", - "help_text": "Turns off the paired-end read merging, and will result in paired-end mapping modes being used during reference of reads again alignment.\n\nThis can be useful in cases where you have long ancient DNA reads, modern DNA or when you want to utilise mate-pair 'spatial' information.\n\n ⚠️ If you run this with --preprocessing_minlength set to a value (as is by default!), you may end up removing single reads from either the pair1 or pair2 file. These reads will be NOT be mapped when aligning with either BWA or bowtie, as both can only accept one (forward) or two (forward and reverse) FASTQs as input in paired-end mode.\n\n> ⚠️ If you run metagenomic screening as well as skipping merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies AdapterRemoval parameter: `--collapse`\n> Modifies fastp parameter: `--merge`" + "help_text": "Turns off the paired-end read merging, and will result in paired-end mapping modes being used during reference of reads again alignment.\n\nThis can be useful in cases where you have long ancient DNA reads, modern DNA or when you want to utilise mate-pair 'spatial' information.\n\n \u26a0\ufe0f If you run this with --preprocessing_minlength set to a value (as is by default!), you may end up removing single reads from either the pair1 or pair2 file. 
These reads will be NOT be mapped when aligning with either BWA or bowtie, as both can only accept one (forward) or two (forward and reverse) FASTQs as input in paired-end mode.\n\n> \u26a0\ufe0f If you run metagenomic screening as well as skipping merging, all reads will be screened as independent reads - not as pairs! - as all FASTQ files from BAM filtering are merged into one. This merged file is _not_ saved in results directory.\n\n> Modifies AdapterRemoval parameter: `--collapse`\n> Modifies fastp parameter: `--merge`" }, "preprocessing_excludeunmerged": { "type": "boolean", @@ -389,7 +389,7 @@ "type": "integer", "default": 0, "description": "Specify number of bases to hard-trim from 5 prime or front of reads.", - "help_text": "Specify number of bases to hard-trim from 5 prime or front of reads. Exact behaviour varies per tool, see documentation. By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n ⚠️ When this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior to mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie 2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 5p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. 
Therefore, this is more suitable for hard-removal of damage before mapping (however the Bowtie 2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim5p`\n> Modifies fastp parameters: `--trim_front1` and/or `--trim_front2`", + "help_text": "Specify number of bases to hard-trim from 5 prime or front of reads. Exact behaviour varies per tool, see documentation. By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n \u26a0\ufe0f When this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior to mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie 2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 5p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. 
Therefore, this is more suitable for hard-removal of damage before mapping (however the Bowtie 2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim5p`\n> Modifies fastp parameters: `--trim_front1` and/or `--trim_front2`", "fa_icon": "fas fa-cut" }, "preprocessing_trim3p": { @@ -397,7 +397,7 @@ "default": 0, "description": "Specify number of bases to hard-trim from 3 prime or tail of reads.", "fa_icon": "fas fa-cut", - "help_text": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation. By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n⚠️ When this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior to mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie 2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 3p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. 
Therefore this is more suitable for hard-removal of damage before mapping (however the Bowtie 2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim3p`\n> Modifies fastp parameters: `--trim_tail1` and/or `--trim_tail2`" + "help_text": "Specify number of bases to hard-trim from 3 prime or tail of reads. Exact behaviour varies per tool, see documentation. By default set to `0` to not perform any hard trimming.\n\nThis parameter allows users to 'hard' remove a number of bases from the beginning or end of reads, regardless of quality.\n\n\u26a0\ufe0f When this trimming occurs depends on the tool, i.e., the exact behaviour is not the same between AdapterRemoval and fastp.\n\nFor fastp: 5p/3p trimming occurs _prior_ to any other trimming (quality, poly-G, adapter). Please see the [fastp documentation](https://github.com/OpenGene/fastp#global-trimming) for more information. If you wish to use this to remove damage prior to mapping (to allow more specific mapping), ensure you have manually removed adapters/quality trimmed **prior** to giving the reads to nf-core/eager. Alternatively, you can use Bowtie 2's inbuilt pre-mapping read-end trimming functionality. Note that nf-core/eager only allows this hard trimming equally for both forward and reverse reads (i.e., you cannot provide different values for the 3p end for R1 and R2).\n\nFor AdapterRemoval, this trimming happens _after_ the removal of adapters, however prior to quality trimming. 
Therefore this is more suitable for hard-removal of damage before mapping (however the Bowtie 2 system will be more reliable).\n\n> Modifies AdapterRemoval parameters: `--trim3p`\n> Modifies fastp parameters: `--trim_tail1` and/or `--trim_tail2`" }, "preprocessing_savepreprocessedreads": { "type": "boolean", @@ -702,12 +702,12 @@ "default": 4, "fa_icon": "fas fa-flag", "description": "Specify the SAM format flag of reads to remove during BAM filtering for downstream genomic steps.", - "help_text": "Specify to customise the exact SAM format flag of reads you wish to _remove_ from your BAM file to for downstream _genomic_ analyses.\n\nYou can explore more using a tool from the Broad Institute [here](https://broadinstitute.github.io/picard/explain-flags.html)\n\n> ⚠️ Modify at your own risk, alternative flags are not necessarily supported in downstream steps!\n\n> Modifies samtools parameter: `-F`" + "help_text": "Specify to customise the exact SAM format flag of reads you wish to _remove_ from your BAM file to for downstream _genomic_ analyses.\n\nYou can explore more using a tool from the Broad Institute [here](https://broadinstitute.github.io/picard/explain-flags.html)\n\n> \u26a0\ufe0f Modify at your own risk, alternative flags are not necessarily supported in downstream steps!\n\n> Modifies samtools parameter: `-F`" }, "bamfiltering_retainunmappedgenomicbam": { "type": "boolean", "description": "Specify to retain unmapped reads in the BAM file used for downstream genomic analyses.", - "help_text": "Specify to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. 
By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomics_input` is set to `all`.\n\n> ⚠️ This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`", + "help_text": "Specify to retain unmapped reads (optionally also length filtered) in the genomic BAM for downstream analysis. By default, the pipeline only keeps mapped reads for downstream analysis.\n\nThis is also turned on if `--metagenomics_input` is set to `all`.\n\n> \u26a0\ufe0f This will likely slow down run time of downstream pipeline steps!\n\n> Modifies tool parameter(s):\n> - samtools view: `-f 4` / `-F 4`", "fa_icon": "fas fa-piggy-bank" }, "bamfiltering_generatefastq": { @@ -921,14 +921,12 @@ "metagenomics_maltextract_taxonlist": { "type": "string", "description": "Path to a text file with taxa of interest (one taxon per row, NCBI taxonomy name format)", - "default": null, "help_text": "Path to a `.txt` file with taxa of interest you wish to assess for aDNA characteristics. In `.txt` file should be one taxon per row, and the taxon should be in a valid [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) name format corresponding to a taxonomic node in your MALT database. 
An example can be found on the [HOPS github](https://raw.githubusercontent.com/rhuebler/HOPS/external/Resources/default_list.txt).\\n\\nNecessary when `--metagenomics_profiling_tool malt` specified and `--metagenomics_run_postprocessing` flagged.\n\n Modifies tool parameter(s):\n> - MaltExtract: `-t`", "fa_icon": "fas fa-align-left" }, "metagenomics_maltextract_ncbidir": { "type": "string", "description": "Path to directory containing containing NCBI resource files (ncbi.tre and ncbi.map; available: https://github.com/rhuebler/HOPS/)", - "default": null, "help_text": "Path to directory containing containing the NCBI resource tree and taxonomy table files (ncbi.tre and ncbi.map; available at the [HOPS repository](https://github.com/rhuebler/HOPS/Resources)).\\n\\nNecessary when `--metagenomics_profiling_tool malt` and `--metagenomics_run_postprocessing` specified.\n\n Modifies tool parameter(s):\n> - MaltExtract: `-r`", "fa_icon": "fab fa-buffer" }, @@ -1006,7 +1004,7 @@ "type": "string", "default": "markduplicates", "description": "Specify which tool to use for deduplication.", - "help_text": "Specify which duplicate read removal tool to use. While `markduplicates` is set by default, an ancient DNA specific read deduplication tool `dedup` is offered (see [Peltzer et al. 2016](https://doi.org/10.1186/s13059-016-0918-z) for details). The latter utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different).\n\n> ⚠️ DeDup can only be used on collapsed (i.e. merged) reads from paired-end sequencing.", + "help_text": "Specify which duplicate read removal tool to use. While `markduplicates` is set by default, an ancient DNA specific read deduplication tool `dedup` is offered (see [Peltzer et al. 2016](https://doi.org/10.1186/s13059-016-0918-z) for details). 
The latter utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different).\n\n> \u26a0\ufe0f DeDup can only be used on collapsed (i.e. merged) reads from paired-end sequencing.", "enum": ["markduplicates", "dedup"], "fa_icon": "fas fa-hammer" } @@ -1024,7 +1022,7 @@ "type": "boolean", "fa_icon": "fas fa-power-off", "description": "Specify to turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.", - "help_text": "Specify to turn on mapDamage2's BAM rescaling functionality. This probabilistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single-stranded, this will automatically use the `--single-stranded` mode.\nThis process will ameliorate the effects of aDNA damage, but also increase reference-bias.\n\n**This functionality does not have any MultiQC output.**\n ⚠️ Rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input.\n\n> Modifies mapDamage2 parameter: `--rescale`" + "help_text": "Specify to turn on mapDamage2's BAM rescaling functionality. This probabilistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. 
If the library is specified to be single-stranded, this will automatically use the `--single-stranded` mode.\nThis process will ameliorate the effects of aDNA damage, but also increase reference-bias.\n\n**This functionality does not have any MultiQC output.**\n \u26a0\ufe0f Rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input.\n\n> Modifies mapDamage2 parameter: `--rescale`" }, "damage_manipulation_rescale_seqlength": { "type": "integer", @@ -1602,6 +1600,87 @@ }, "fa_icon": "fas fa-transgender-alt", "help_text": "" + }, + "consensus_sequence": { + "title": "Consensus sequence", + "type": "object", + "description": "Options to create a consensus sequence from a VCF file.", + "default": "", + "fa_icon": "far fa-handshake", + "properties": { + "run_consensus_sequence": { + "type": "boolean", + "description": "Specify to create consensus sequence from VCF files", + "fa_icon": "fas fa-power-off", + "help_text": "Specify to create consensus sequence from VCF files." + }, + "consensus_tool": { + "type": "string", + "fa_icon": "fas fa-hammer", + "description": "Specify which tool to use to create consensus sequences: multivcfanalyzer, gencons", + "enum": ["multivcfanalyzer", "gencons"], + "help_text": "Specify the tool to create the consensus sequence. Accepted values: multivcfanalyzer, gencons.\n\nNote: These tools currently only support diploid GATK UnifiedGenotyper vcfs as input." + }, + "consensus_multivcfanalyzer_write_allele_frequencies": { + "type": "boolean", + "fa_icon": "fas fa-edit", + "description": "Specify to write the allele frequencies in the SNP table", + "help_text": "Turn on writing allele frequencies in the SNP table produced by MultiVCFAnalyzer."
+ }, + "consensus_multivcfanalyzer_min_genotype_quality": { + "type": "integer", + "default": 30, + "fa_icon": "fas fa-medal", + "description": "Specify the minimum genotyping quality for base calling.", + "help_text": "Specify the minimum genotyping quality threshold for a position to be considered for base calling." + }, + "consensus_multivcfanalyzer_min_base_coverage": { + "type": "integer", + "default": 5, + "fa_icon": "fas fa-sort-amount-up", + "description": "Specify the minimum number of reads required for base calling", + "help_text": "Specify the minimum number of reads covering a position for it to be considered for base calling." + }, + "consensus_multivcfanalyzer_allele_freq_hom": { + "type": "number", + "default": 0.9, + "fa_icon": "fas fa-percent", + "description": "Specify the minimum allele frequency required for a base to be called as 'homozygous'", + "help_text": "Specify the minimum allele frequency that a base requires to be considered a 'homozygous' call." + }, + "consensus_multivcfanalyzer_allele_freq_het": { + "type": "number", + "default": 0.9, + "description": "Specify the minimum allele frequency required for a base to be called as 'heterozygous'", + "help_text": "Specify the minimum allele frequency that a base requires to be considered a 'heterozygous' call.", + "fa_icon": "fas fa-percent" + }, + "consensus_multivcfanalyzer_additional_vcf_files": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Specify paths to additional VCF files", + "help_text": "Specify paths to additional pre-made VCF files to be included in the SNP table generation. Use wildcard(s) for multiple files e.g. `/path/to/*.vcf.gz`. Optional." + }, + "consensus_multivcfanalyzer_reference_gff_annotations": { + "type": "string", + "fa_icon": "fas fa-asterisk", + "description": "Specify path to GFF reference file", + "help_text": "Specify path to the reference genome annotations in '.gff' format. Optional."
+ }, + "consensus_multivcfanalyzer_reference_gff_exclude": { + "type": "string", + "description": "Specify path to GFF with positions to be excluded", + "help_text": "Specify path to the positions to be excluded in '.gff' format. Optional.", + "fa_icon": "fas fa-ban" + }, + "consensus_multivcfanalyzer_snpeff_results": { + "type": "string", + "fa_icon": "fas fa-poll-h", + "description": "Specify path to SNP effect output", + "help_text": "Specify path to the output file from SNPEff containing SNP effects in '.txt' format. Optional." + } + }, + "help_text": "" } }, "allOf": [ @@ -1658,6 +1737,9 @@ }, { "$ref": "#/$defs/human_sex_determination" + }, + { + "$ref": "#/$defs/consensus_sequence" } ] } diff --git a/subworkflows/local/consensus_sequence.nf b/subworkflows/local/consensus_sequence.nf new file mode 100644 index 000000000..0e203a453 --- /dev/null +++ b/subworkflows/local/consensus_sequence.nf @@ -0,0 +1,96 @@ +// +// Produce consensus sequences from reads aligned to a reference, some targeted to aDNA +// + +include { MULTIVCFANALYZER } from '../../modules/nf-core/multivcfanalyzer' +include { addNewMetaFromAttributes } from '../../subworkflows/local/utils_nfcore_eager_pipeline/main' +include { TABIX_BGZIP as UG_BGZIP } from '../../modules/nf-core/tabix/bgzip' +include { GUNZIP as REF_MVA_GUNZIP } from '../../modules/nf-core/gunzip' + +workflow CONSENSUS_SEQUENCE { + take: + ch_genotypes_vcf + ch_samplesheet_vcfs // [meta, additional_vcf] + ch_mva_files // [meta, reference_gff, reference_gff_exclude, reference_snpeff_results] + ch_fasta // [ meta, fasta, fai, dict, mapindex ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + if (params.consensus_tool == 'multivcfanalyzer') { + + write_allele_frequencies = params.consensus_multivcfanalyzer_write_allele_frequencies ?
"T" : "F" + + ch_genotypes_unzip = ch_genotypes_vcf.map { meta, vcfs, vcf_index -> + [meta, vcfs] + } + ch_genotypes_vcf_final = UG_BGZIP(ch_genotypes_unzip).output + .map { + addNewMetaFromAttributes(it, "reference", "reference", false) + } + .groupTuple() + .map { metaref, meta, vcfs -> + [metaref, vcfs] + } + .dump(tag: "consensus_genotyped_vcfs") + ch_additional_vcfs = REF_MVA_GUNZIP(ch_samplesheet_vcfs).gunzip + .dump(tag: "additional_vcfs") + + ch_fasta_final = ch_fasta + .map { meta, fasta, fai, dict, mapindex -> + def new_meta = meta.subMap(['id']) + [[reference: new_meta.id], fasta] + } + .dump(tag: "consensus_fasta") + + ch_mva_input = ch_mva_files + .dump(tag: "consensus_ref_related_files") + .map { meta, reference_gff, reference_gff_exclude, reference_snpeff_results -> + def new_meta = meta.subMap(['id']) + [[reference: new_meta.id], reference_gff, reference_gff_exclude, reference_snpeff_results] + } + .join(ch_genotypes_vcf_final) + .dump(tag: "consensus_postjoin") + .join(ch_additional_vcfs) + .join(ch_fasta_final) + .dump(tag: "consensus_postjoin2") +// .multiMap { meta, reference_gff, reference_gff_exclude, reference_snpeff_results, ug_vcfs, additional_vcf, fasta -> +// vcfs: [meta, additional_vcf + ug_vcfs] +// reference_gff: [meta, reference_gff ?: []] +// reference_gff_exclude: [meta, reference_gff_exclude ?: []] +// reference_snpeff_results: [meta, reference_snpeff_results ?: []] +// reference_fasta: [meta, fasta] +// } +// .dump(tag: "consensus_sequence_final") + +// MULTIVCFANALYZER( +// ch_mva_input.vcfs, +// ch_mva_input.reference_fasta, +// ch_mva_input.reference_snpeff_results, +// ch_mva_input.reference_gff, +// write_allele_frequencies, +// params.consensus_multivcfanalyzer_min_genotype_quality, +// params.consensus_multivcfanalyzer_min_base_coverage, +// params.consensus_multivcfanalyzer_allele_freq_hom, +// params.consensus_multivcfanalyzer_allele_freq_het, +// ch_mva_input.reference_gff_exclude, +// ) + +// ch_full_alignment_mva = 
MULTIVCFANALYZER.out.full_alignment +// ch_info_mva = MULTIVCFANALYZER.out.info_txt +// ch_snp_alignment_mva = MULTIVCFANALYZER.out.snp_alignment +// ch_snp_genome_alignment_mva = MULTIVCFANALYZER.out.snp_genome_alignment +// ch_snp_statistics_mva = MULTIVCFANALYZER.out.snpstatistics +// ch_snp_table_mva = MULTIVCFANALYZER.out.snptable +// ch_snp_table_snpeff_mva = MULTIVCFANALYZER.out.snptable_snpeff +// ch_snp_table_uncertainty_mva = MULTIVCFANALYZER.out.snptable_uncertainty +// ch_structure_genotypes_mva = MULTIVCFANALYZER.out.structure_genotypes +// ch_structure_genotypes_nomissing_mva = MULTIVCFANALYZER.out.structure_genotypes_nomissing +// ch_versions = ch_versions.mix(MULTIVCFANALYZER.out.versions) +// ch_multiqc_files = ch_multiqc_files.mix(MULTIVCFANALYZER.out.json) + } + + emit: + versions = ch_versions // channel: path(versions.yml) + mqc = ch_multiqc_files // channel: [ val(meta), path("*.json") ] +} diff --git a/subworkflows/local/preprocessing_fastp.nf b/subworkflows/local/preprocessing_fastp.nf index 780fe84e9..c73866262 100644 --- a/subworkflows/local/preprocessing_fastp.nf +++ b/subworkflows/local/preprocessing_fastp.nf @@ -2,8 +2,8 @@ // Process short raw reads with FastP // -include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/fastp/main' -include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/fastp/main' +include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/fastp/main' +include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/fastp/main' workflow PREPROCESSING_FASTP { take: @@ -11,42 +11,38 @@ workflow PREPROCESSING_FASTP { adapterlist // .fasta main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() - ch_input_for_fastp = reads - .branch{ - single: it[0]['single_end'] == true - paired: it[0]['single_end'] == false - } + ch_input_for_fastp = reads.branch { + single: it[0]['single_end'] == true + paired: it[0]['single_end'] == 
false + } - FASTP_SINGLE ( ch_input_for_fastp.single, adapterlist, false, false ) + FASTP_SINGLE(ch_input_for_fastp.single, adapterlist, false, false) ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) - ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json ) + ch_multiqc_files = ch_multiqc_files.mix(FASTP_SINGLE.out.json) // Last parameter here turns on merging of PE data - FASTP_PAIRED ( ch_input_for_fastp.paired, adapterlist, false, !params.preprocessing_skippairmerging ) + FASTP_PAIRED(ch_input_for_fastp.paired, adapterlist, false, !params.preprocessing_skippairmerging) ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) - ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json ) - - if ( !params.preprocessing_skippairmerging ) { - ch_fastp_reads_prepped_pe = FASTP_PAIRED.out.reads_merged - .map { - meta, reads -> - def meta_new = meta.clone() - meta_new['single_end'] = true - [ meta_new, [ reads ].flatten() ] - } - - ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix( FASTP_SINGLE.out.reads ) - - } else { - ch_fastp_reads_prepped = FASTP_PAIRED.out.reads - .mix( FASTP_SINGLE.out.reads ) + ch_multiqc_files = ch_multiqc_files.mix(FASTP_PAIRED.out.json) + + if (!params.preprocessing_skippairmerging) { + ch_fastp_reads_prepped_pe = FASTP_PAIRED.out.reads_merged.map { meta, reads -> + def meta_new = meta.clone() + meta_new['single_end'] = true + [meta_new, [reads].flatten()] + } + + ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix(FASTP_SINGLE.out.reads) + } + else { + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads.mix(FASTP_SINGLE.out.reads) } emit: - reads = ch_fastp_reads_prepped // channel: [ val(meta), [ reads ] ] - versions = ch_versions // channel: [ versions.yml ] + reads = ch_fastp_reads_prepped // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] mqc = ch_multiqc_files } diff --git a/subworkflows/local/reference_indexing.nf 
b/subworkflows/local/reference_indexing.nf index 0080c75cf..f5e814f7d 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -19,6 +19,7 @@ workflow REFERENCE_INDEXING { main: ch_versions = Channel.empty() + println "start indexing reference" // Warn user if they've given a reference sheet that already includes fai/dict/mapper index etc. if ( ( fasta.extension == 'csv' || fasta.extension == 'tsv' ) && ( fasta_fai || fasta_dict || fasta_mapperindexdir )) log.warn("A TSV or CSV has been supplied to `--fasta_sheet` as well as e.g. `--fasta_fai`. --fasta_sheet CSV/TSV takes priority and --fasta_* parameters will be ignored.") if ( ( fasta.extension == 'csv' || fasta.extension == 'tsv' ) && ( params.mitochondrion_header || params.contamination_estimation_angsd_hapmap || params.damage_manipulation_pmdtools_reference_mask || params.damage_manipulation_pmdtools_reference_mask || params.snpcapture_bed || params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile || params.sexdeterrmine_bedfile || params.mapstats_bedtools_featurefile || params.genotyping_reference_ploidy || params.genotyping_gatk_dbsnp || params.fasta_circular_target || params.circularmapper_elongated_fasta || params.circularmapper_elongated_fai )) log.warn("A TSV or CSV has been supplied to `--fasta_sheet` as well as individual reference-specific input files, e.g. `--contamination_estimation_angsd_hapmap`. 
Input files specified in the --fasta_sheet CSV/TSV take priority and other input parameters will be ignored.") @@ -37,6 +38,7 @@ workflow REFERENCE_INDEXING { ch_sexdeterrmine_bed = REFERENCE_INDEXING_MULTI.out.sexdeterrmine_bed ch_bedtools_feature = REFERENCE_INDEXING_MULTI.out.bedtools_feature ch_dbsnp = REFERENCE_INDEXING_MULTI.out.dbsnp + ch_mva = REFERENCE_INDEXING_MULTI.out.mva ch_versions = ch_versions.mix( REFERENCE_INDEXING_MULTI.out.versions ) } else { // If input FASTA and/or indicies supplied @@ -52,6 +54,7 @@ workflow REFERENCE_INDEXING { ch_bedtools_feature = REFERENCE_INDEXING_SINGLE.out.bedtools_feature ch_reference_for_mapping = REFERENCE_INDEXING_SINGLE.out.reference ch_dbsnp = REFERENCE_INDEXING_SINGLE.out.dbsnp + ch_mva = REFERENCE_INDEXING_SINGLE.out.mva ch_versions = ch_versions.mix( REFERENCE_INDEXING_SINGLE.out.versions ) } @@ -149,6 +152,7 @@ workflow REFERENCE_INDEXING { ch_elongated_indexed_reference = ch_reference_to_elongate ch_elongated_chr_list = Channel.empty() } + println "end indexing reference" emit: reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex ] @@ -163,6 +167,7 @@ workflow REFERENCE_INDEXING { sexdeterrmine_bed = ch_sexdeterrmine_bed // [ meta, sexdet_bed ] bedtools_feature = ch_bedtools_feature // [ meta, bedtools_feature ] dbsnp = ch_dbsnp // [ meta, dbsnp ] + mva = ch_mva // [ meta, consensus_multivcfanalyzer_reference_gff_annotations, consensus_multivcfanalyzer_reference_gff_exclude, consensus_multivcfanalyzer_reference_snpeff_results ] versions = ch_versions } diff --git a/subworkflows/local/reference_indexing_multi.nf b/subworkflows/local/reference_indexing_multi.nf index 0bab9e1dc..eedc89e16 100644 --- a/subworkflows/local/reference_indexing_multi.nf +++ b/subworkflows/local/reference_indexing_multi.nf @@ -38,7 +38,10 @@ workflow REFERENCE_INDEXING_MULTI { sexdet_bed = sexdet_bed != [] ? sexdet_bed : "" bedtools_feature = bedtools_feature != [] ? 
bedtools_feature : "" genotyping_gatk_dbsnp = genotyping_gatk_dbsnp != [] ? genotyping_gatk_dbsnp : "" - [meta - meta.subMap('genotyping_ploidy'), fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp] + consensus_multivcfanalyzer_reference_gff_annotations = consensus_multivcfanalyzer_reference_gff_annotations != [] ? consensus_multivcfanalyzer_reference_gff_annotations : "" + consensus_multivcfanalyzer_reference_gff_exclude = consensus_multivcfanalyzer_reference_gff_exclude != [] ? consensus_multivcfanalyzer_reference_gff_exclude : "" + consensus_multivcfanalyzer_reference_snpeff_results = consensus_multivcfanalyzer_reference_snpeff_results != [] ? consensus_multivcfanalyzer_reference_snpeff_results : "" + [meta - meta.subMap('genotyping_ploidy'), fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, consensus_multivcfanalyzer_reference_gff_annotations, consensus_multivcfanalyzer_reference_gff_exclude, consensus_multivcfanalyzer_reference_snpeff_results ] } // GENERAL DESCRIPTION FOR NEXT SECTIONS @@ -52,7 +55,7 @@ workflow REFERENCE_INDEXING_MULTI { // DECOMPRESSION // - ch_input_from_referencesheet = ch_splitreferencesheet_for_branch.multiMap { meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> + ch_input_from_referencesheet = ch_splitreferencesheet_for_branch.multiMap { meta, fasta, fai, dict, 
mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, consensus_multivcfanalyzer_reference_gff_annotations, consensus_multivcfanalyzer_reference_gff_exclude, consensus_multivcfanalyzer_reference_snpeff_results -> generated: [meta, fasta, fai, dict, mapper_index] circularmapper: [meta, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex] mitochondrion_header: [meta, mitochondrion] @@ -64,6 +67,7 @@ workflow REFERENCE_INDEXING_MULTI { sexdeterrmine_bed: [meta, sexdet_bed] bedtools_feature: [meta, bedtools_feature] dbsnp: [meta, genotyping_gatk_dbsnp] + mva: [meta, consensus_multivcfanalyzer_reference_gff_annotations, consensus_multivcfanalyzer_reference_gff_exclude, consensus_multivcfanalyzer_reference_snpeff_results] } // Detect if fasta is gzipped or not @@ -194,5 +198,6 @@ workflow REFERENCE_INDEXING_MULTI { sexdeterrmine_bed = ch_input_from_referencesheet.sexdeterrmine_bed // [ meta, sexdet_bed ] bedtools_feature = ch_input_from_referencesheet.bedtools_feature // [ meta, bedtools_feature ] dbsnp = ch_input_from_referencesheet.dbsnp // [ meta, genotyping_gatk_dbsnp ] + mva = ch_input_from_referencesheet.mva versions = ch_versions } diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index d98834376..b66bec224 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -25,6 +25,7 @@ workflow REFERENCE_INDEXING_SINGLE { def fasta_ext = grabUngzippedExtension(fasta) def clean_name = fasta.name.toString() - fasta_ext + println "start indexing single" // Detect if fasta is gzipped or not, unzip if necessary, and generate meta ID by sanitizing file if ( fasta.extension == 'gz' ) { ch_gz_ref = Channel.fromPath(fasta).map{[[], it]} 
@@ -100,12 +101,15 @@ workflow REFERENCE_INDEXING_SINGLE { def genotyping_gatk_dbsnp = params.genotyping_gatk_dbsnp != null ? file(params.genotyping_gatk_dbsnp, checkIfExists: true ) : "" def circularmapper_elongated_fasta = params.fasta_circularmapper_elongatedfasta != null ? file( params.fasta_circularmapper_elongatedfasta, checkIfExists: true ) : "" def circularmapper_elongated_index = params.fasta_circularmapper_elongatedindex != null ? file( params.fasta_circularmapper_elongatedindex, checkIfExists: true ) : "" - [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, circularmapper_elongated_fasta, circularmapper_elongated_index ] + def consensus_multivcfanalyzer_reference_gff_annotations = params.consensus_multivcfanalyzer_reference_gff_annotations != null ? file(params.consensus_multivcfanalyzer_reference_gff_annotations, checkIfExists: true ) : "" + def consensus_multivcfanalyzer_reference_gff_exclude = params.consensus_multivcfanalyzer_reference_gff_exclude != null ? file(params.consensus_multivcfanalyzer_reference_gff_exclude, checkIfExists: true ) : "" + def consensus_multivcfanalyzer_reference_snpeff_results = params.consensus_multivcfanalyzer_snpeff_results != null ? 
file(params.consensus_multivcfanalyzer_snpeff_results, checkIfExists: true ) : "" + [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, circularmapper_elongated_fasta, circularmapper_elongated_index, consensus_multivcfanalyzer_reference_gff_annotations, consensus_multivcfanalyzer_reference_gff_exclude, consensus_multivcfanalyzer_reference_snpeff_results ] } ch_ref_index_single = ch_reference_for_mapping .multiMap{ - meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, circularmapper_elongated_fasta, circularmapper_elongated_index -> + meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, circularmapper_elongated_fasta, circularmapper_elongated_index, consensus_multivcfanalyzer_reference_gff_annotations, consensus_multivcfanalyzer_reference_gff_exclude, consensus_multivcfanalyzer_reference_snpeff_results -> reference: [ meta, fasta, fai, dict, mapper_index ] circularmapper: [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index ] mito_header: [ meta, mitochondrion_header ] @@ -117,8 +121,11 @@ workflow REFERENCE_INDEXING_SINGLE { sexdeterrmine_bed: [ meta, sexdet_bed ] bedtools_feature: [ meta, bedtools_feature ] dbsnp: [ meta, genotyping_gatk_dbsnp ] + mva: [ meta, consensus_multivcfanalyzer_reference_gff_annotations, consensus_multivcfanalyzer_reference_gff_exclude, 
consensus_multivcfanalyzer_reference_snpeff_results ] } + println "End single indexing" + emit: reference = ch_ref_index_single.reference // [ meta, fasta, fai, dict, mapindex ] elongated_reference = ch_ref_index_single.circularmapper // [ meta, circular_target, circularmapper_elongated_fasta, circularmapper_elongated_index ] @@ -131,6 +138,7 @@ workflow REFERENCE_INDEXING_SINGLE { sexdeterrmine_bed = ch_ref_index_single.sexdeterrmine_bed // [ meta, sexdet_bed ] bedtools_feature = ch_ref_index_single.bedtools_feature // [ meta, bedtools_feature ] dbsnp = ch_ref_index_single.dbsnp // [ meta, genotyping_gatk_dbsnp ] + mva = ch_ref_index_single.mva // [ meta, consensus_multivcfanalyzer_reference_gff_annotations, consensus_multivcfanalyzer_reference_gff_exclude, consensus_multivcfanalyzer_reference_snpeff_results ] versions = ch_versions } diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index b562b355b..9d9c759e8 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -103,40 +103,53 @@ workflow PIPELINE_INITIALISATION { // ch_samplesheet = channel.fromList(samplesheetToList(input, "${projectDir}/assets/schema_input.json")) .map { - meta, r1, r2, bam -> + meta, r1, r2, bam, vcf -> meta.single_end = meta.pairment == "single" ? true : false meta.id = meta.sample_id - [ meta, r1, r2, bam ] + [ meta, r1, r2, bam, vcf ] } ch_samplesheet_for_branch = ch_samplesheet .branch { - meta, r1, r2, bam -> + meta, r1, r2, bam, vcf -> bam: bam.toString().endsWith(".bam") + vcf: vcf.toString().endsWith(".vcf.gz") fastq: true } ch_samplesheet_fastqs = ch_samplesheet_for_branch.fastq .map { - meta, r1, r2, bam -> + meta, r1, r2, bam, vcf -> reads = meta.single_end ? 
[ r1 ] : [ r1, r2 ] [ meta - meta.subMap('pairment', 'bam_reference_id'), reads ] } + .dump(tag:"fastq_samplesheet") ch_samplesheet_bams = ch_samplesheet_for_branch.bam .map { - meta, r1, r2, bam -> + meta, r1, r2, bam, vcf -> meta.reference = meta.bam_reference_id meta.id_index = meta.bam_reference_id [ meta - meta.subMap('pairment', 'bam_reference_id'), bam ] } + .dump(tag:"bams_samplesheet") + + ch_samplesheet_vcfs = ch_samplesheet_for_branch.vcf + .map { + meta, r1, r2, bam, vcf -> + meta.reference = meta.vcf_reference_id + meta.id_index = meta.vcf_reference_id + [ meta - meta.subMap('pairment', 'vcf_reference_id'), vcf ] + } + .dump(tag: "additional_vcfs_samplesheet") + // Extra validation // - Only paired end specified when R2 provided // - No single-ended data allowed when using dedup ch_samplesheet_for_branch.fastq .map { - meta, r1, r2, bam -> + meta, r1, r2, bam, vcf -> if ( meta.pairment == "single" && r2 != [] ) { exit 1, "[nf-core] ERROR: Validation of 'input' file failed. Reads 2 cannot be provided when sequencing pairment is set to 'single'." } @@ -146,23 +159,33 @@ workflow PIPELINE_INITIALISATION { if ( meta.pairment == "single" && params.deduplication_tool == "dedup" ) { exit 1, "[nf-core] ERROR: Invalid input/parameter combination. '--deduplication_tool' cannot be 'dedup' on runs that include SE data. Use 'markduplicates' for runs with both SE and PE data or separate SE and PE data into separate runs." } - [ meta, r1, r2, bam ] + [ meta, r1, r2, bam, vcf ] } // - Only single-ended specified for BAM files ch_samplesheet_for_branch.bam .map { - meta, r1, r2, bam -> + meta, r1, r2, bam, vcf -> if ( meta.pairment == "paired" && bam != [] ) { exit 1, "[nf-core] ERROR: Validation of 'input' file failed. Sequencing pairment has to be 'single' when BAM files are provided." 
} - [ meta, r1, r2, bam ] + [ meta, r1, r2, bam, vcf ] + } + + // - Only single-ended specified for VCF files + ch_samplesheet_for_branch.vcf + .map { + meta, r1, r2, bam, vcf -> + if ( meta.pairment == "paired" && vcf != [] ) { + exit 1, "[nf-core] ERROR: Validation of 'input' file failed. Sequencing pairment has to be 'single' when VCF files are provided." + } + [ meta, r1, r2, bam, vcf ] } // - No single- and double-stranded libraries with same sample ID ch_samplesheet_test = ch_samplesheet .map { - meta, r1, r2, bam -> + meta, r1, r2, bam, vcf -> [ meta.subMap('sample_id'), meta.subMap('strandedness') ] } .groupTuple() @@ -176,6 +199,7 @@ workflow PIPELINE_INITIALISATION { emit: samplesheet_fastqs = ch_samplesheet_fastqs samplesheet_bams = ch_samplesheet_bams + samplesheet_vcfs = ch_samplesheet_vcfs versions = ch_versions } diff --git a/workflows/eager.nf b/workflows/eager.nf index e613d5cd5..43ed93f58 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -35,6 +35,7 @@ include { RUN_SEXDETERRMINE } from '../subwork include { MERGE_LIBRARIES } from '../subworkflows/local/merge_libraries' include { MERGE_LIBRARIES as MERGE_LIBRARIES_GENOTYPING } from '../subworkflows/local/merge_libraries' include { GENOTYPE } from '../subworkflows/local/genotype' +include { CONSENSUS_SEQUENCE } from '../subworkflows/local/consensus_sequence' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -73,6 +74,7 @@ workflow EAGER { take: ch_samplesheet_fastqs // channel: samplesheet FASTQ entries read in from --input ch_samplesheet_bams // channel: samplesheet BAM entries read in from --input + ch_samplesheet_vcfs // channel: samplesheet VCFs entries read in from --input main: @@ -146,6 +148,8 @@ workflow EAGER { REFERENCE_INDEXING(fasta_fn, fasta_fai, fasta_dict, fasta_mapperindexdir) ch_versions = ch_versions.mix(REFERENCE_INDEXING.out.versions) + REFERENCE_INDEXING.out.reference.dump(tag: "indexing_reference") + // // MODULE: Run 
FastQC or Falco // @@ -205,8 +209,7 @@ workflow EAGER { // SUBWORKFLOW: Merging lanes for ch_bams_from_input MERGE_LANES_INPUTBAM(ch_bams_from_input) - ch_bams_from_input_lanemerged = MERGE_LANES_INPUTBAM.out.bam - .join(MERGE_LANES_INPUTBAM.out.bai) + ch_bams_from_input_lanemerged = MERGE_LANES_INPUTBAM.out.bam.join(MERGE_LANES_INPUTBAM.out.bai) ch_flagstat_bams_from_input_lanemerged = MERGE_LANES_INPUTBAM.out.flagstat } else { @@ -222,8 +225,8 @@ workflow EAGER { if (params.run_bamfiltering || params.run_metagenomics) { ch_mapped_for_bamfilter = MAP.out.bam - .join(MAP.out.bai) - .mix(ch_bams_from_input_lanemerged) + .join(MAP.out.bai) + .mix(ch_bams_from_input_lanemerged) FILTER_BAM(ch_mapped_for_bamfilter) ch_bamfiltered_for_deduplication = FILTER_BAM.out.genomics ch_bamfiltered_for_metagenomics = FILTER_BAM.out.metagenomics @@ -232,8 +235,8 @@ workflow EAGER { } else { ch_bamfiltered_for_deduplication = MAP.out.bam - .join(MAP.out.bai) - .mix(ch_bams_from_input_lanemerged) + .join(MAP.out.bai) + .mix(ch_bams_from_input_lanemerged) } ch_reads_for_deduplication = ch_bamfiltered_for_deduplication @@ -391,8 +394,7 @@ workflow EAGER { // MODULE: ENDORSPY (raw, filtered, deduplicated) // - ch_flagstat_for_endorspy_raw = MAP.out.flagstat - .mix( ch_flagstat_bams_from_input_lanemerged ) + ch_flagstat_for_endorspy_raw = MAP.out.flagstat.mix(ch_flagstat_bams_from_input_lanemerged) if (params.run_bamfiltering & !params.skip_deduplication) { ch_for_endorspy = ch_flagstat_for_endorspy_raw @@ -560,6 +562,21 @@ workflow EAGER { ch_multiqc_files = ch_multiqc_files.mix(GENOTYPE.out.mqc.collect { it[1] }.ifEmpty([])) } + // + // SUBWORKFLOW: Consensus sequence + // + ch_samplesheet_vcfs.dump(tag: "vcfs_additional_samplesheet") + if (params.run_consensus_sequence) { + CONSENSUS_SEQUENCE( + GENOTYPE.out.vcf, + ch_samplesheet_vcfs, + REFERENCE_INDEXING.out.mva, + REFERENCE_INDEXING.out.reference, + ) + } + + + // // Collate and save software versions //