diff --git a/CHANGELOG.md b/CHANGELOG.md index 738f285d60..1c69625f20 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#671](https://github.com/SciLifeLab/Sarek/pull/671) - New `publishDirMode` param and docs - [#673](https://github.com/SciLifeLab/Sarek/pull/673), [#675](https://github.com/SciLifeLab/Sarek/pull/675), [#676](https://github.com/SciLifeLab/Sarek/pull/676) - Profiles for BinAC and CFC clusters in Tübingen - [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Add container for `CreateIntervalBeds` -- [#692](https://github.com/SciLifeLab/Sarek/pull/692) - Add AWS iGenomes possibilities (currently under `iGRCh37` and `iGRCh38`) +- [#692](https://github.com/SciLifeLab/Sarek/pull/692), [#697](https://github.com/SciLifeLab/Sarek/pull/697) - Add AWS iGenomes possibilities (within `conf/igenomes.config`) - [#694](https://github.com/SciLifeLab/Sarek/pull/694) - Add monochrome and grey logos for light or dark background ### `Changed` diff --git a/conf/aws-batch.config b/conf/aws-batch.config index 2c8001e619..0293d684bb 100644 --- a/conf/aws-batch.config +++ b/conf/aws-batch.config @@ -8,7 +8,7 @@ */ params { - genome_base = params.genome == 'iGRCh37' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37" : params.genome == 'iGRCh38' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small" + genome_base = params.genome == 'GRCh37' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37" : params.genome == 'GRCh38' ? 
"s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small" publishDirMode = 'copy' } diff --git a/conf/base.config b/conf/base.config index 9864cde1d4..3b19264e30 100644 --- a/conf/base.config +++ b/conf/base.config @@ -6,7 +6,6 @@ * ------------------------------------------------- */ -includeConfig 'genomes.config' wf_repository = 'maxulysse' params { diff --git a/conf/genomes.config b/conf/genomes.config index 95bd91ff83..1928b780dd 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -4,10 +4,12 @@ * ------------------------------------------------- * Path to reference files * ------------------------------------------------- - * Imported under all Nextflow profiles in + * Imported under Nextflow profiles in * nextflow.config * ------------------------------------------------- - * Modify to add specific versions of genomes + * Defines reference genomes, using paths + * Can be used by any config that customises the base + * path using $params.genome_base / --genome_base * ------------------------------------------------- */ @@ -42,32 +44,6 @@ params { //AF_files = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf" //AF_indexes = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf.idx" } - 'iGRCh37' { - acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_20130502_SNP_maf0.3.loci" - dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_138.b37.vcf" - dbsnpIndex = "${params.genome_base}/Annotation/GATKBundle/dbsnp_138.b37.vcf.idx" - genomeFile = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta" - genomeDict = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.dict" - genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.fai" - bwaIndex = 
"${params.genome_base}/Sequence/BWAIndex/human_g1k_v37_decoy.fasta.{amb,ann,bwt,pac,sa}" - intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions_CAW.list" - knownIndels = "${params.genome_base}/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf" - knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.idx" - snpeffDb = "GRCh37.75" - } - 'iGRCh38' { - acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci" - dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz" - dbsnpIndex = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz.tbi" - genomeFile = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" - genomeDict = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.dict" - genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai" - bwaIndex = "${params.genome_base}/Sequence/BWAIndex/Homo_sapiens_assembly38.fasta.64.{alt,amb,ann,bwt,pac,sa}" - intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions.hg38.bed" - knownIndels = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" - knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" - snpeffDb = "GRCh38.86" - } 'smallGRCh37' { acLoci = "${params.genome_base}/1000G_phase3_20130502_SNP_maf0.3.small.loci" dbsnp = "${params.genome_base}/dbsnp_138.b37.small.vcf" diff --git a/conf/igenomes.config b/conf/igenomes.config new file mode 100644 index 0000000000..e3d7077856 --- /dev/null +++ b/conf/igenomes.config @@ -0,0 +1,58 @@ +/* + * ------------------------------------------------- + * Nextflow config file for Sarek + * 
------------------------------------------------- + * Path to iGenomes reference files + * ------------------------------------------------- + * Imported under Nextflow profiles in + * nextflow.config + * ------------------------------------------------- + * Defines reference genomes, using iGenome paths + * Can be used by any config that customises the base + * path using $params.genome_base / --genome_base + * ------------------------------------------------- + */ + +params { + genomes { + 'GRCh37' { + acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_20130502_SNP_maf0.3.loci" + dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_138.b37.vcf" + dbsnpIndex = "${params.genome_base}/Annotation/GATKBundle/dbsnp_138.b37.vcf.idx" + genomeFile = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta" + genomeDict = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.dict" + genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.fai" + bwaIndex = "${params.genome_base}/Sequence/BWAIndex/human_g1k_v37_decoy.fasta.{amb,ann,bwt,pac,sa}" + intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions_CAW.list" + knownIndels = "${params.genome_base}/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf" + knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.idx" + snpeffDb = "GRCh37.75" + } + 'GRCh38' { + acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci" + dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz" + dbsnpIndex = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz.tbi" + genomeFile = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" + genomeDict = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.dict" + genomeIndex = 
"${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai" + bwaIndex = "${params.genome_base}/Sequence/BWAIndex/Homo_sapiens_assembly38.fasta.64.{alt,amb,ann,bwt,pac,sa}" + intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions.hg38.bed" + knownIndels = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" + knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" + snpeffDb = "GRCh38.86" + } + 'smallGRCh37' { + acLoci = "${params.genome_base}/1000G_phase3_20130502_SNP_maf0.3.small.loci" + dbsnp = "${params.genome_base}/dbsnp_138.b37.small.vcf" + dbsnpIndex = "${dbsnp}.idx" + genomeFile = "${params.genome_base}/human_g1k_v37_decoy.small.fasta" + bwaIndex = "${genomeFile}.{amb,ann,bwt,pac,sa}" + genomeDict = "${params.genome_base}/human_g1k_v37_decoy.small.dict" + genomeIndex = "${genomeFile}.fai" + intervals = "${params.genome_base}/small.intervals" + knownIndels = "${params.genome_base}/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.small.vcf" + knownIndelsIndex = "${params.genome_base}/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.small.vcf.idx" + snpeffDb = "GRCh37.75" + } + } +} diff --git a/docs/REFERENCES.md b/docs/REFERENCES.md index eb6e21c6ea..0ef1ce94c4 100644 --- a/docs/REFERENCES.md +++ b/docs/REFERENCES.md @@ -1,11 +1,13 @@ # Genomes and reference files -Sarek currently uses GRCh38 by default. The settings are in `genomes.config`, they can be tailored to your needs. +Sarek currently uses GRCh38 by default. +The settings are in `genomes.config`, they can be tailored to your needs. The [`buildReferences.nf`](#buildreferencesnf) script is used to build the indexes for the reference test. ## GRCh37 -Use `--genome GRCh37` to map against GRCh37. 
Before doing so and if you are not on UPPMAX, you need to adjust the settings in `genomes.config` to your needs. +Use `--genome GRCh37` to map against GRCh37. +Before doing so and if you are not on UPPMAX, you need to adjust the settings in `genomes.config` to your needs. ### GATK bundle @@ -20,7 +22,8 @@ The following files need to be downloaded: ### Other files for GRCh37 -From our repo, get the [`intervals` list file](https://raw.githubusercontent.com/SciLifeLab/Sarek/master/repeats/wgs_calling_regions.grch37.list). More information about this file in the [intervals documentation](INTERVALS.md) +From our repo, get the [`intervals` list file](https://raw.githubusercontent.com/SciLifeLab/Sarek/master/repeats/wgs_calling_regions.grch37.list). +More information about this file is available in the [intervals documentation](INTERVALS.md). Description of how to generate the Loci file used in the ASCAT process is described [here](https://github.com/SciLifeLab/Sarek/blob/master/docs/ASCAT.md). @@ -28,13 +31,18 @@ You can create your own cosmic reference for any human reference as specified be ## GRCh38 -Use `--genome GRCh38` to map against GRCh38. Before doing so and if you are not on UPPMAX, you need to adjust the settings in `genomes.config` to your needs. +Use `--genome GRCh38` to map against GRCh38. +Before doing so and if you are not on UPPMAX, you need to adjust the settings in `genomes.config` to your needs. -To get the needed files, download the GATK bundle for GRCh38 from [ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/](ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/). You can also download the required files from the Google Cloud mirror link [here](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0). +To get the needed files, download the GATK bundle for GRCh38 from [ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/](ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/). 
+You can also download the required files from the Google Cloud mirror link [here](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0). The MD5SUM of `Homo_sapiens_assembly38.fasta` included in that file is 7ff134953dcca8c8997453bbb80b6b5e. -If you download the data from the FTP servers `beta/` directory, which seems to be an older version of the bundle, only `Homo_sapiens_assembly38.known_indels.vcf` is needed. Also, you can omit `dbsnp_138_` and `dbsnp_144` files as we use `dbsnp_146`. The old ones also use the wrong chromosome naming convention. The Google Cloud mirror has all data in the `v0` directory, but requires you to remove the `resources_broad_hg38_v0_` prefixes from all files. +If you download the data from the FTP server's `beta/` directory, which seems to be an older version of the bundle, only `Homo_sapiens_assembly38.known_indels.vcf` is needed. +Also, you can omit `dbsnp_138_` and `dbsnp_144` files as we use `dbsnp_146`. +The old ones also use the wrong chromosome naming convention. +The Google Cloud mirror has all data in the `v0` directory, but requires you to remove the `resources_broad_hg38_v0_` prefixes from all files. The following files need to be downloaded: @@ -68,7 +76,8 @@ You can create your own cosmic reference for any human reference as specified be To annotate with COSMIC variants during MuTect1/2 Variant Calling you need to create a compatible VCF file. Download the coding and non-coding VCF files from [COSMIC](http://cancer.sanger.ac.uk/cosmic/download) and process them with the [Create\_Cosmic.sh](https://github.com/SciLifeLab/Sarek/tree/master/scripts/Create_Cosmic.sh) -script for either GRCh37 or GRCh38. The script requires a fasta index `.fai`, of the reference file you are using. +script for either GRCh37 or GRCh38. +The script requires a fasta index (`.fai`) of the reference file you are using. 
Example: @@ -87,11 +96,12 @@ igvtools index ## smallGRCh37 -Use `--genome smallGRCh37` to map against a small reference genome based on GRCh37. `smallGRCh37` is the default genome for the testing profile (`-profile testing`). +Use `--genome smallGRCh37` to map against a small reference genome based on GRCh37. +`smallGRCh37` is the default genome for the testing profile (`-profile testing`). ## AWS iGenomes Sarek is using [AWS iGenomes](https://ewels.github.io/AWS-iGenomes/), which facilitate storing and sharing references. -Both `GRCh37` and `GRCh38` are available with `--genome iGRCh37` or `--genome iGRCh38` respectively, it contains all data previously detailed. +Both `GRCh37` and `GRCh38` are available with `--genome GRCh37` or `--genome GRCh38` respectively with any profile using the `conf/igenomes.config` file (e.g. `awsbatch`), or you can specify it with `-c conf/igenomes.config`; it contains all data previously detailed. ## buildReferences.nf diff --git a/nextflow.config b/nextflow.config index ddffe4b7f3..9e2bdebdea 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,6 +28,7 @@ profiles { // Singularity images need to be set up standard { includeConfig 'conf/base.config' + includeConfig 'conf/genomes.config' includeConfig 'conf/uppmax-localhost.config' includeConfig 'conf/singularity-path.config' } @@ -36,6 +37,7 @@ profiles { // Singularity images need to be set up slurm { includeConfig 'conf/base.config' + includeConfig 'conf/genomes.config' includeConfig 'conf/uppmax-slurm.config' includeConfig 'conf/singularity-path.config' } @@ -44,6 +46,7 @@ profiles { // Singularity images will be pulled automatically slurmDownload { includeConfig 'conf/base.config' + includeConfig 'conf/genomes.config' includeConfig 'conf/uppmax-slurm.config' includeConfig 'conf/singularity.config' includeConfig 'conf/containers.config' @@ -52,6 +55,7 @@ profiles { // Docker images will be pulled automatically docker { includeConfig 'conf/base.config' + includeConfig 
'conf/genomes.config' includeConfig 'conf/travis.config' includeConfig 'conf/docker.config' includeConfig 'conf/containers.config' @@ -60,6 +64,7 @@ profiles { // Docker images will be pulled automatically awsbatch { includeConfig 'conf/base.config' + includeConfig 'conf/igenomes.config' includeConfig 'conf/aws-batch.config' includeConfig 'conf/docker.config' includeConfig 'conf/containers.config' @@ -68,6 +73,7 @@ profiles { // Singularity images will be pulled automatically singularity { includeConfig 'conf/base.config' + includeConfig 'conf/genomes.config' includeConfig 'conf/travis.config' includeConfig 'conf/singularity.config' includeConfig 'conf/containers.config' @@ -76,6 +82,7 @@ profiles { // Singularity images need to be set up singularityPath { includeConfig 'conf/base.config' + includeConfig 'conf/genomes.config' includeConfig 'conf/travis.config' includeConfig 'conf/singularity-path.config' } @@ -85,14 +92,16 @@ profiles { // Singularity images will be pulled automatically binac { includeConfig 'conf/base.config' + includeConfig 'conf/genomes.config' includeConfig 'conf/binac.config' includeConfig 'conf/singularity.config' includeConfig 'conf/resources.config' includeConfig 'conf/containers.config' } - // Default config for CFC cluster in Tuebingen/Germany + // Default config for CFC cluster in Tuebingen/Germany cfc { includeConfig 'conf/base.config' + includeConfig 'conf/genomes.config' includeConfig 'conf/cfc.config' includeConfig 'conf/singularity.config' includeConfig 'conf/resources.config' @@ -132,4 +141,4 @@ def check_max(obj, type) { return obj } } -} \ No newline at end of file +}