diff --git a/CHANGELOG.md b/CHANGELOG.md index 74a568496c..920d9776f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#671](https://github.com/SciLifeLab/Sarek/pull/671) - publishDir modes are now params - [#677](https://github.com/SciLifeLab/Sarek/pull/677) - Update docs - [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Update old awsbatch configuration +- [#682](https://github.com/SciLifeLab/Sarek/pull/682) - Specifications for memory and cpus for awsbatch - [#693](https://github.com/SciLifeLab/Sarek/pull/693) - Qualimap bamQC is now ran after mapping and after recalibration for better QC - [#700](https://github.com/SciLifeLab/Sarek/pull/700) - Update GATK to `4.0.9.0` - [#702](https://github.com/SciLifeLab/Sarek/pull/702) - update FastQC to `0.11.8` diff --git a/conf/aws-batch.config b/conf/aws-batch.config index 0293d684bb..33850cd686 100644 --- a/conf/aws-batch.config +++ b/conf/aws-batch.config @@ -10,6 +10,8 @@ params { genome_base = params.genome == 'GRCh37' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37" : params.genome == 'GRCh38' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small" publishDirMode = 'copy' + singleCPUMem = 7.GB // To make the uppmax slurm copy paste work. 
+ localReportDir = 'Reports' } executor { @@ -17,6 +19,12 @@ executor { awscli = '/home/ec2-user/miniconda/bin/aws' } +/* Rolling files are currently not supported on s3 */ +report.file = "${params.localReportDir}/Sarek_report.html" +timeline.file = "${params.localReportDir}/Sarek_timeline.html" +dag.file = "${params.localReportDir}/Sarek_DAG.svg" +trace.file = "${params.localReportDir}/Sarek_trace.txt" + process { queue = params.awsqueue @@ -26,4 +34,29 @@ process { cpus = 2 memory = 8.GB + withName:RunBcftoolsStats { + cpus = 1 + memory = {params.singleCPUMem * 2} // Memory is doubled so that it won't run two on the same instance + // Use a tiny queue for this one, so storage doesn't run out + queue = params.awsqueue_tiny + } + withName:RunVcftools { + cpus = 1 + memory = {params.singleCPUMem * 2} // Memory is doubled so that it won't run two on the same instance + // Use a tiny queue for this one, so storage doesn't run out + queue = params.awsqueue_tiny + } + withName:RunHaplotypecaller { + cpus = 1 + // Memory below is a fixed 2x singleCPUMem (not scaled per attempt) + memory = {params.singleCPUMem * 2} // Memory is doubled so that it won't run two on the same instance + // Use a tiny queue for this one, so storage doesn't run out + queue = params.awsqueue_tiny + } + withName:RunGenotypeGVCFs { + cpus = 1 + memory = {params.singleCPUMem * 2} // Memory is doubled so that it won't run two on the same instance + // Use a tiny queue for this one, so storage doesn't run out + queue = params.awsqueue_tiny + } } diff --git a/conf/base.config b/conf/base.config index 3b19264e30..b890e8b076 100644 --- a/conf/base.config +++ b/conf/base.config @@ -38,6 +38,8 @@ params { test = false // Not testing by default verbose = false // Enable for more verbose information awsqueue = false // Queue has to be provided when using awsbatch executor + awsqueue_tiny = params.awsqueue // A separate queue with smaller instance types + localReportDir = false // Used by AWS since reporting is not fully supported on s3 
buckets } process { @@ -67,6 +69,6 @@ dag { // Turning on dag by default trace { // Turning on trace tracking by default enabled = true - fields = 'process,task_id,hash,name,attempt,status,exit,realtime,%cpu,vmem,rss,submit,start,complete,duration,realtime,rchar,wchar' + fields = 'process,task_id,hash,name,attempt,status,exit,realtime,cpus,memory,%cpu,vmem,rss,submit,start,complete,duration,realtime,rchar,wchar' file = "${params.outDir}/Reports/Sarek_trace.txt" } diff --git a/conf/resources.config b/conf/resources.config index f7d5c8c470..1a7dc84a77 100644 --- a/conf/resources.config +++ b/conf/resources.config @@ -21,25 +21,27 @@ process { withName:MapReads { memory = { check_max( 60.GB * task.attempt, 'memory' ) } - cpus = { check_max( 10, 'cpus' ) } + cpus = { check_max( 16, 'cpus' ) } } withName:CreateRecalibrationTable { - cpus = { check_max( 12, 'cpus' ) } - memory = {params.singleCPUMem * 8 * task.attempt} + cpus = { check_max( 1, 'cpus' ) } + memory = { check_max( 60.GB * task.attempt, 'memory') } } withName:MarkDuplicates { - // Actually the -Xmx value should be kept lower + // Actually the -Xmx value should be kept lower, + // and is set through the markdup_java_options cpus = { check_max( 8, 'cpus' ) } memory = { check_max( 8.GB * task.attempt, 'memory' ) } } withName:MergeBams { + cpus = { check_max( 4, 'cpus') } memory = {params.singleCPUMem * task.attempt} time = { check_max( 5.h * task.attempt, 'time' ) } } withName:RecalibrateBam { - cpus = { check_max( 12, 'cpus' ) } - memory = { check_max( 7.GB * 8 * task.attempt, 'memory' ) } - time = { check_max( 5.h * task.attempt, 'time' ) } + cpus = { check_max( 2, 'cpus' ) } + memory = { check_max( 7.GB * 2 * task.attempt, 'memory' ) } + time = { check_max( 10.h * task.attempt, 'time' ) } } withName:RunAlleleCount { cpus = { check_max( 1, 'cpus' ) } @@ -49,6 +51,14 @@ process { cpus = { check_max( 1, 'cpus' ) } memory = { check_max( 14.GB * task.attempt, 'memory' ) } } + withName:RunBamQCmapped { + cpus = { 
check_max( 6, 'cpus' ) } + memory = { check_max( 70.GB, 'memory' ) } + } + withName:RunBamQCrecalibrated { + cpus = { check_max( 6, 'cpus' ) } + memory = { check_max( 70.GB, 'memory' ) } + } withName:RunBcftoolsStats { cpus = { check_max( 1, 'cpus' ) } } @@ -65,13 +75,13 @@ process { memory = { check_max( 8.GB * task.attempt, 'memory' ) } } withName:RunHaplotypecaller { - cpus = { check_max( 20, 'cpus' ) } + cpus = { check_max( 1, 'cpus' ) } // Increase memory quadratically memory = { check_max( 7.GB * 2 * task.attempt, 'memory' ) } time = { check_max( 5.h * task.attempt, 'time' ) } } withName:RunGenotypeGVCFs { - cpus = { check_max( 20, 'cpus' ) } + cpus = { check_max( 1, 'cpus' ) } memory = { check_max( 7.GB * task.attempt, 'memory' ) } } withName:RunMultiQC { @@ -86,20 +96,24 @@ process { cpus = { check_max( 2, 'cpus' ) } time = { check_max( 5.h * task.attempt, 'time' ) } } + withName:RunSingleManta { + cpus = { check_max( 20, 'cpus' ) } + memory = { check_max( 16.GB, 'memory') } + } withName:RunSingleStrelka { + cpus = { check_max( 20, 'cpus' ) } + memory = { check_max( 16.GB, 'memory') } time = { check_max( 5.h * task.attempt, 'time' ) } } withName:RunSnpeff { cpus = { check_max( 1, 'cpus' ) } - errorStrategy = { task.exitStatus == 143 ? 'retry' : 'ignore' } } withName:RunStrelka { cpus = { check_max( 1, 'cpus' ) } time = { check_max( 5.h * task.attempt, 'time' ) } } withName:RunVEP { - cpus = { check_max( 1, 'cpus' ) } + cpus = { check_max( 16, 'cpus' ) } memory = {check_max (32.GB * task.attempt, 'memory' ) } - errorStrategy = { task.exitStatus == 143 ? 'retry' : 'ignore' } } -} \ No newline at end of file +} diff --git a/conf/uppmax-slurm.config b/conf/uppmax-slurm.config index 9be0393fa5..9e455ac55f 100644 --- a/conf/uppmax-slurm.config +++ b/conf/uppmax-slurm.config @@ -12,6 +12,9 @@ params { singleCPUMem = 7.GB // for processes that are using more memory but a single CPU only. Use the 'core' queue for these } +// Extended set of fields, e.g. 
native_id, cpus and memory: +trace.fields = 'process,task_id,hash,name,native_id,attempt,status,exit,realtime,cpus,memory,%cpu,vmem,rss,submit,start,complete,duration,realtime,rchar,wchar' + process { clusterOptions = {"-A $params.project"} cpus = 16 diff --git a/docs/PARAMETERS.md b/docs/PARAMETERS.md index 54c3f8a609..4255ad5aff 100644 --- a/docs/PARAMETERS.md +++ b/docs/PARAMETERS.md @@ -62,6 +62,14 @@ So you can write `--tools mutect2,ascat` or `--tools MuTect2,ASCAT` without worr Only required if you use the awsbatch profile. This parameter specifies the queue for which jobs are submitted in AWS Batch. +### --awsqueue_tiny `BatchQueueName` + +Only used if you use the awsbatch profile. This parameter specifies a queue used for certain small jobs that might still require a significant amount of disk storage. + +### --localReportDir `Directory` + +Only used if you use the awsbatch profile. This parameter specifies an output directory for Nextflow reports, such as Sarek_timeline.html, since storing them on s3 is currently not fully supported. + ### --verbose Display more information about files being processed. diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy index fbc2ba62d2..0b3931c014 100644 --- a/lib/SarekUtils.groovy +++ b/lib/SarekUtils.groovy @@ -36,6 +36,7 @@ class SarekUtils { 'annotateTools', 'annotateVCF', 'awsqueue', + 'awsqueue_tiny', 'build', 'call-name', 'callName', @@ -52,6 +53,8 @@ class SarekUtils { 'genome', 'genomes', 'help', + 'localReportDir', + 'local-report-dir', 'markdup_java_options', 'max_cpus', 'max_memory', diff --git a/nextflow.config b/nextflow.config index 9e2bdebdea..55fb177d2d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -67,6 +67,7 @@ profiles { includeConfig 'conf/igenomes.config' includeConfig 'conf/aws-batch.config' includeConfig 'conf/docker.config' + includeConfig 'conf/resources.config' includeConfig 'conf/containers.config' } // Small testing with Singularity profile