Merged
73 commits
6ce647d
Add cram support, read splitting
FriederikeHanssen Jun 15, 2021
21a695a
Add estimate library complexity if spark is used
FriederikeHanssen Jun 16, 2021
a634a79
Fixes resume problem, but is losing the file name...
FriederikeHanssen Jun 16, 2021
1632d4b
Add tmp dir to gatk processes so tmp files are written to the proper …
FriederikeHanssen Jun 16, 2021
ab0c1c8
Fix filename display + resume for TABIX
FriederikeHanssen Jun 16, 2021
e8290a1
Try to get spark to work
FriederikeHanssen Jun 16, 2021
809d321
Add MDSpark back in
FriederikeHanssen Jun 16, 2021
6ba8720
try with runoptions
FriederikeHanssen Jun 16, 2021
46dcdc2
The newest gatk container is not working for me with spark, 4.1.9.0 is
FriederikeHanssen Jun 16, 2021
41649cc
Add docker.userEmulation back in
FriederikeHanssen Jun 16, 2021
b4dd4ca
Add more spark things
FriederikeHanssen Jun 21, 2021
147d6b4
Fix module params
FriederikeHanssen Jun 21, 2021
53d69fa
Publish ref
FriederikeHanssen Jun 21, 2021
5b4fc53
try with path instead of file
FriederikeHanssen Jun 21, 2021
1158010
try with path instead of file
FriederikeHanssen Jun 21, 2021
9b1768f
try with fromFile instead
FriederikeHanssen Jun 21, 2021
6323fbe
file
FriederikeHanssen Jun 21, 2021
838932f
whole fixownership seems to work
FriederikeHanssen Jun 21, 2021
dd4f783
Merge remote-tracking branch 'upstream/dsl2' into dsl2
FriederikeHanssen Jun 21, 2021
39403ec
Add numLanes to meta sheet to deal with blocked mapping output channels
FriederikeHanssen Jun 22, 2021
a62acb1
Add channel dumping to check for missing id
FriederikeHanssen Jun 22, 2021
cd51cbc
use groupKey instead
FriederikeHanssen Jun 22, 2021
21c168e
not sure if this works, but run a bigger test for this
FriederikeHanssen Jun 22, 2021
d4354d6
Simplify mapping epression
FriederikeHanssen Jun 23, 2021
26448a6
remove unused sw & add ref to samtools stats
FriederikeHanssen Jun 23, 2021
87b6831
Add skip_fastqc in again
FriederikeHanssen Jun 23, 2021
5184462
try with exporting ref path and cache
FriederikeHanssen Jun 23, 2021
46effa4
Remove quotes
FriederikeHanssen Jun 23, 2021
70d28c0
add mutect2 somatic module
FriederikeHanssen Jun 23, 2021
a920909
Try to circumvent stats issues with view
FriederikeHanssen Jun 23, 2021
2047dc8
Add ref to cram merge
FriederikeHanssen Jun 24, 2021
112aaf0
Add ref to cram merge
FriederikeHanssen Jun 24, 2021
52632e6
Use double quotes for output
FriederikeHanssen Jun 24, 2021
144fef7
Add ref to stats
FriederikeHanssen Jun 24, 2021
44c5bdf
fix logic for bam to cram conversion
FriederikeHanssen Jun 24, 2021
1d5b395
Simplify if
FriederikeHanssen Jun 24, 2021
6a77b3c
Add memory overhead for gatk based tools
FriederikeHanssen Jun 24, 2021
5ab6078
Add mutect2 somatic
FriederikeHanssen Jun 24, 2021
e1e0d34
add conf
FriederikeHanssen Jun 25, 2021
dbe4bdb
remove failing dumo statement
FriederikeHanssen Jun 25, 2021
951b78d
select spark tools
FriederikeHanssen Jun 25, 2021
f08901f
change use_gatk_spark in bwamem2
FriederikeHanssen Jun 26, 2021
d44e522
change use_gatk_spark in bwamem2
FriederikeHanssen Jun 26, 2021
dda8b13
add dump tag to figure out why bqsr is not working
FriederikeHanssen Jun 27, 2021
db3d98a
try withotu clone to get to work on aws
FriederikeHanssen Jun 27, 2021
287146f
remove meta.id
FriederikeHanssen Jun 27, 2021
2732291
USe channels for known sites
FriederikeHanssen Jun 27, 2021
c398e01
Try to fix known_sites channel
FriederikeHanssen Jun 27, 2021
a665e66
add groupTuple back in
FriederikeHanssen Jun 27, 2021
bccdb7d
change dbsnp/knownindels channel
FriederikeHanssen Jun 27, 2021
118f9c1
fix multiple knwonindels input
FriederikeHanssen Jun 27, 2021
c2e73c6
add dump statements, why are the intervals not working for humans
FriederikeHanssen Jun 27, 2021
d5cec16
sth of when providing multiple indices
FriederikeHanssen Jun 27, 2021
777f7a2
add tbi back in
FriederikeHanssen Jun 27, 2021
286bbe3
concat seems to fix this channel madness
FriederikeHanssen Jun 27, 2021
3fd8078
hardcode number of intervals for tests
FriederikeHanssen Jun 27, 2021
493a147
fix docker image tag, can't find singularity one
FriederikeHanssen Jun 27, 2021
dd88aa3
add haplotypecalelr back in
FriederikeHanssen Jun 27, 2021
ec7a2d5
count num intervals with map oprator
FriederikeHanssen Jun 28, 2021
1be7416
collect dbsnp tbi to avoid consumption of channel
FriederikeHanssen Jun 28, 2021
206db89
add gvcf back in
FriederikeHanssen Jun 28, 2021
9ce70a0
Add bamqc after bqsr with crams
FriederikeHanssen Jun 29, 2021
047e5cf
Use docker image for htslib + singularity
FriederikeHanssen Jun 29, 2021
19c911a
add dbsnp back in
FriederikeHanssen Jun 29, 2021
2342971
add dbsnp back in
FriederikeHanssen Jun 29, 2021
5e8f2cc
Resolve merge conflicts
FriederikeHanssen Jun 29, 2021
4f974f6
add step/tools to indices wf
FriederikeHanssen Jun 29, 2021
d766d44
Resolve remaining merge conflicts/fix problems
FriederikeHanssen Jun 29, 2021
3a37778
add try/catch to figure out why module conf is not loaded
FriederikeHanssen Jun 29, 2021
22d55ab
Split by num reads instead of parts to generate similar sized files
FriederikeHanssen Jul 2, 2021
3f26535
Code clean up
FriederikeHanssen Jul 15, 2021
069a4f1
Fix merge conflicts
FriederikeHanssen Jul 15, 2021
1866022
apply suggestions from code review
FriederikeHanssen Jul 15, 2021
65 changes: 48 additions & 17 deletions conf/modules.config
@@ -57,6 +57,10 @@ params {
publish_files = false
}
// MAPPING
'seqkit_split2' {
args = "--by-size ${params.split_fastq}"
publish_files = false
}
'bwa_mem1_mem' {
args = '-K 100000000 -M'
args2 = 'sort'
@@ -77,6 +81,28 @@
args2 = 'sort'
publish_files = false
}
// MARKDUPLICATES
'markduplicates' {
args = 'REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT'
suffix = '.md'
publish_by_meta = true
publish_dir = 'preprocessing'
publish_files = false
}
'markduplicatesspark' {
args = '--remove-sequencing-duplicates false -VS LENIENT'
suffix = '.md'
publish_by_meta = true
publish_dir = 'preprocessing'
publish_files = ['cram': 'markduplicates', 'crai': 'markduplicates']
}
'estimatelibrarycomplexity' {
args = ''
suffix = '.md'
publish_by_meta = true
publish_dir = 'preprocessing'
publish_files = ['metrics': 'markduplicates']
}
'merge_bam_mapping' {
publish_by_meta = true
publish_files = ['bam':'mapped']
@@ -87,36 +113,32 @@ params {
publish_by_meta = true
publish_dir = 'reports/qualimap'
}
'samtools_index_mapping' {
publish_by_meta = true
publish_files = ['bai':'mapped']
publish_dir = 'preprocessing'
}
'samtools_stats_mapping' {
publish_by_meta = true
publish_dir = 'reports/samtools_stats'
}
// MARKDUPLICATES
'markduplicates' {
args = 'REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT'
'samtools_view' {
suffix = '.md'
publish_by_meta = true
publish_dir = 'preprocessing'
publish_files = ['bam': 'markduplicates', 'bai': 'markduplicates']
publish_files = ['cram': 'markduplicates', 'crai': 'markduplicates']
}
'markduplicatesspark' {
args = '--remove-sequencing-duplicates false -VS LENIENT'
suffix = '.md'
'samtools_index_cram' {
publish_by_meta = true
publish_files = ['crai':'sth']
publish_dir = 'preprocessing'
publish_files = ['bam': 'markduplicates', 'bai': 'markduplicates']
}
// PREPARE_RECALIBRATION
'baserecalibrator' {
publish_by_meta = true
publish_dir = 'preprocessing'
publish_files = ['recal.table': 'recal_table']
}
'baserecalibrator_spark' {
publish_by_meta = true
publish_dir = 'preprocessing'
publish_files = ['recal.table': 'recal_table']
}
'gatherbqsrreports' {
publish_by_meta = true
publish_dir = 'preprocessing'
@@ -127,10 +149,14 @@ params {
suffix = '.recal'
publish_files = false
}
'merge_bam_recalibrate' {
'applybqsr_spark' {
suffix = '.recal'
publish_files = false
}
'merge_cram_recalibrate' {
suffix = '.recal'
publish_by_meta = true
publish_files = ['bam':'recalibrated']
publish_files = ['cram':'recalibrated']
publish_dir = 'preprocessing'
}
'qualimap_bamqc_recalibrate' {
@@ -142,7 +168,7 @@
suffix = 'recal'
publish_by_meta = true
publish_dir = 'preprocessing'
publish_files = ['recal.bam':'recalibrated', 'recal.bam.bai':'recalibrated']
publish_files = ['recal.cram':'recalibrated', 'recal.cram.crai':'recalibrated']
}
'samtools_stats_recalibrate' {
publish_by_meta = true
@@ -186,7 +212,7 @@ params {
}

// TUMOR_VARIANT_CALLING

//
// PAIR_VARIANT_CALLING
'manta_somatic' {
publish_by_meta = true
Expand All @@ -208,6 +234,11 @@ params {
publish_dir = 'variant_calling'
publish_files = ['vcf.gz':'strelka', 'vcf.gz.tbi':'strelka']
}
'mutect2_somatic' {
publish_by_meta = true
publish_dir = 'variant_calling'
publish_files = ['vcf.gz':'mutect2', 'vcf.gz.tbi':'mutect2']
}
// ANNOTATE
'snpeff' {
args = '-nodownload -canon -v'
4 changes: 2 additions & 2 deletions conf/test.config
@@ -45,7 +45,7 @@ profiles {
params.save_bam_mapped = true
}
split_fastq {
params.split_fastq = 500
params.split_fastq = 2
}
targeted {
params.target_bed = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/target.bed'
@@ -67,7 +67,7 @@ profiles {
params.trim_fastq = true
}
use_gatk_spark {
params.use_gatk_spark = true
params.use_gatk_spark = 'markduplicates,bqsr'
}
umi_quiaseq {
params.genome = 'smallGRCh38'
2 changes: 1 addition & 1 deletion docs/usage.md
@@ -98,7 +98,7 @@ This is _not_ recommended.
* Specify `--use_gatk_spark`
* `test_split_fastq`
* A profile with a complete configuration for automated testing
* Specify `--split_fastq 500`
* Specify `--split_fastq 2`
* `test_targeted`
* A profile with a complete configuration for automated testing
* Include link to a target `BED` file and use `Manta` and `Strelka` for Variant Calling
4 changes: 3 additions & 1 deletion modules/local/concat_vcf/main.nf
@@ -13,11 +13,13 @@ process CONCAT_VCF {

conda (params.enable_conda ? "bioconda::htslib=1.12" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/htslib:1.12--hd3b49d5_0"
//TODO: No singularity container at the moment, use docker container for the moment
container "quay.io/biocontainers/htslib:1.12--h9093b5e_1"
} else {
container "quay.io/biocontainers/htslib:1.12--hd3b49d5_0"
}


input:
tuple val(meta), path(vcf)
path fai
33 changes: 33 additions & 0 deletions modules/local/index_target_bed/main.nf
@@ -0,0 +1,33 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'

params.options = [:]
options = initOptions(params.options)

process INDEX_TARGET_BED {
tag "$target_bed"
label 'process_medium'
publishDir "${params.outdir}",
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "bioconda::htslib=1.12" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
//TODO: No singularity container at the moment, use docker container for the moment
container "quay.io/biocontainers/htslib:1.12--h9093b5e_1"
} else {
container "quay.io/biocontainers/htslib:1.12--hd3b49d5_0"
}

input:
path target_bed

output:
tuple path("${target_bed}.gz"), path("${target_bed}.gz.tbi")

script:
"""
bgzip --threads ${task.cpus} -c ${target_bed} > ${target_bed}.gz
tabix ${target_bed}.gz
"""
}
Comment on lines +1 to +33 (Member): I added that in nf-core/modules

11 changes: 9 additions & 2 deletions modules/nf-core/software/bwa/mem/main.nf
@@ -29,8 +29,15 @@ process BWA_MEM {
script:
def split_cpus = Math.floor(task.cpus/2)
def software = getSoftwareName(task.process)
def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
def part = params.split_fastq > 1 ? reads.get(0).name.findAll(/part_([0-9]+)?/).last().concat('.') : ""
def prefix = options.suffix ? "${meta.id}${options.suffix}.${part}" : "${meta.id}.${part}"
def read_group = meta.read_group ? "-R ${meta.read_group}" : ""

//MD Spark NEEDS name sorted reads or runtime goes through the roof.
//However, if duplicate marking is skipped, reads need to be coordinate sorted.
//Spark can be used also for BQSR, therefore check for both: only name sort if spark + duplicate marking is done
def sort_order = ('markduplicates' in params.use_gatk_spark) & !params.skip_markduplicates ? "-n" : ""

"""
INDEX=`find -L ./ -name "*.amb" | sed 's/.amb//'`

@@ -40,7 +47,7 @@
-t ${split_cpus} \\
\$INDEX \\
$reads \\
| samtools $options.args2 --threads ${split_cpus} -o ${prefix}.bam -
| samtools $options.args2 $sort_order --threads ${split_cpus} -o ${prefix}bam -

echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//' > ${software}.version.txt
"""
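The sort-order comment in this diff reduces to one small predicate: name-sort the mapped reads (`samtools sort -n`) only when Spark duplicate marking will actually run, and coordinate-sort otherwise. A minimal Python sketch of that decision (function name and parameters are illustrative, not part of the pipeline; the real check lives in the Groovy `sort_order` line above):

```python
def samtools_sort_flag(use_gatk_spark, skip_markduplicates):
    """Return the samtools sort flag implied by the diff's logic.

    MarkDuplicatesSpark needs name-sorted input, so '-n' is chosen only
    when 'markduplicates' is among the requested Spark tools AND
    duplicate marking is not skipped; otherwise no flag (coordinate sort).
    """
    spark_tools = [t.strip() for t in use_gatk_spark.split(",")] if use_gatk_spark else []
    return "-n" if ("markduplicates" in spark_tools and not skip_markduplicates) else ""
```

Note that the Groovy version uses substring containment on the `use_gatk_spark` string (e.g. `'markduplicates,bqsr'`); the sketch splits on commas instead, which is the stricter reading of the same intent.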
11 changes: 9 additions & 2 deletions modules/nf-core/software/bwamem2/mem/main.nf
@@ -29,8 +29,15 @@ process BWAMEM2_MEM {
script:
def split_cpus = Math.floor(task.cpus/2)
def software = getSoftwareName(task.process)
def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
def part = params.split_fastq > 1 ? reads.get(0).name.findAll(/part_([0-9]+)?/).last().concat('.') : ""
def prefix = options.suffix ? "${meta.id}${options.suffix}.${part}" : "${meta.id}.${part}"
def read_group = meta.read_group ? "-R ${meta.read_group}" : ""

//MD Spark NEEDS name sorted reads or runtime goes through the roof.
//However, if duplicate marking is skipped, reads need to be coordinate sorted.
//Spark can be used also for BQSR, therefore check for both: only name sort if spark + duplicate marking is done
def sort_order = ('markduplicates' in params.use_gatk_spark) & !params.skip_markduplicates ? "-n" : ""

"""
INDEX=`find -L ./ -name "*.amb" | sed 's/.amb//'`

@@ -40,7 +47,7 @@
-t ${split_cpus} \\
\$INDEX \\
$reads \\
| samtools $options.args2 -@ ${split_cpus} -o ${prefix}.bam -
| samtools $options.args2 $sort_order -@ ${split_cpus} -o ${prefix}bam -

echo \$(bwa-mem2 version 2>&1) > ${software}.version.txt
"""
38 changes: 38 additions & 0 deletions modules/nf-core/software/freebayes/freebayes.nf
@@ -0,0 +1,38 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'

params.options = [:]
options = initOptions(params.options)

process FREEBAYES {
tag "$meta.id"
label 'process_low'
publishDir "${params.outdir}",
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) }

conda (params.enable_conda ? "bioconda::freebayes=1.3.5" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/freebayes:1.3.5--py38ha193a2f_3"
} else {
container "quay.io/biocontainers/freebayes:1.3.5--py38ha193a2f_3"
}

input:
tuple val(meta), path(cram), path(crai)

output:
// TODO nf-core: Named file extensions MUST be emitted for ALL output channels
tuple val(meta), path("*.bam"), emit: bam
// TODO nf-core: List additional required output channels/values here
path "*.version.txt" , emit: version

script:
def software = getSoftwareName(task.process)
def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}"
"""


echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' > ${software}.version.txt
"""
}
68 changes: 68 additions & 0 deletions modules/nf-core/software/freebayes/functions.nf
@@ -0,0 +1,68 @@
//
// Utility functions used in nf-core DSL2 module files
//

//
// Extract name of software tool from process name using $task.process
//
def getSoftwareName(task_process) {
return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()
}

//
// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules
//
def initOptions(Map args) {
def Map options = [:]
options.args = args.args ?: ''
options.args2 = args.args2 ?: ''
options.args3 = args.args3 ?: ''
options.publish_by_meta = args.publish_by_meta ?: []
options.publish_dir = args.publish_dir ?: ''
options.publish_files = args.publish_files
options.suffix = args.suffix ?: ''
return options
}

//
// Tidy up and join elements of a list to return a path string
//
def getPathFromList(path_list) {
def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries
paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes
return paths.join('/')
}

//
// Function to save/publish module results
//
def saveFiles(Map args) {
if (!args.filename.endsWith('.version.txt')) {
def ioptions = initOptions(args.options)
def path_list = [ ioptions.publish_dir ?: args.publish_dir ]
if (ioptions.publish_by_meta) {
def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta
for (key in key_list) {
if (args.meta && key instanceof String) {
def path = key
if (args.meta.containsKey(key)) {
path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key]
}
path = path instanceof String ? path : ''
path_list.add(path)
}
}
}
if (ioptions.publish_files instanceof Map) {
for (ext in ioptions.publish_files) {
if (args.filename.endsWith(ext.key)) {
def ext_list = path_list.collect()
ext_list.add(ext.value)
return "${getPathFromList(ext_list)}/$args.filename"
}
}
} else if (ioptions.publish_files == null) {
return "${getPathFromList(path_list)}/$args.filename"
}
}
}
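The `getPathFromList` helper above just drops empty segments, trims whitespace and surrounding slashes, and joins with `/`. For readers less familiar with Groovy, an equivalent sketch in Python (a translation for illustration, not code from the module):

```python
import re

def get_path_from_list(path_list):
    """Mirror of the Groovy getPathFromList: drop empty entries,
    strip leading/trailing slashes and whitespace, join with '/'."""
    paths = [p for p in path_list if p and p.strip()]          # remove empty entries
    paths = [re.sub(r"^/+|/+$", "", p.strip()) for p in paths]  # trim slashes/whitespace
    return "/".join(paths)
```

So a list like `['preprocessing', '', '/markduplicates/']` becomes the publish path `preprocessing/markduplicates`.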