merge upstream/dev

FriederikeHanssen · FriederikeHanssen · commit 6a893cc641a8 · 2022-07-20T16:33:23.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,17 +5,9 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [2.7.2](https://github.com/nf-core/sarek/releases/tag/2.7.2) - Áhkká
-
-Áhkká is one of the massifs just outside of the Sarek National Park.
-
-### Fixed
-
-- [#566](https://github.com/nf-core/sarek/pull/566) - Fix caching bug affecting a variable number of `MapReads` jobs due to non-deterministic state of `statusMap` during caching evaluation
-
-## [2.7.1](https://github.com/nf-core/sarek/releases/tag/2.7.1) - Pårtejekna
+## [3.0](https://github.com/nf-core/sarek/releases/tag/3.0) - Skierfe
 
-Pårtejekna is one of glaciers of the Pårte Massif.
+Skierfe is a mountain in the Sarek National Park, and the inspiration for the logo.
 
 ### Added
 
@@ -112,6 +104,7 @@ Pårtejekna is one of glaciers of the Pårte Massif.
 - [#659](https://github.com/nf-core/sarek/pull/659) - Update usage.md docu section on `How to run ASCAT with WES`
 - [#661](https://github.com/nf-core/sarek/pull/661) - Add cnvkit reference creation to index subway map
 - [#663](https://github.com/nf-core/sarek/pull/663) - Add separate parameters for `ASCAT` and `ControlFREEC` back in
+- [#668](https://github.com/nf-core/sarek/pull/668) - Update annotation documentation
 
 ### Fixed
 
@@ -164,6 +157,7 @@ Pårtejekna is one of glaciers of the Pårte Massif.
 - [#655](https://github.com/nf-core/sarek/pull/655) - Fix `--intervals false` logic & add versioning for local modules
 - [#658](https://github.com/nf-core/sarek/pull/658) - Fix split fastq names in multiqc-report
 - [#666](https://github.com/nf-core/sarek/pull/666) - Simplify multiqc config channel input
+- [#668](https://github.com/nf-core/sarek/pull/668) - Add `snpeff_version` and `vep_version` to `schema_ignore_params` to avoid issue when specifying on command line
 - [#669](https://github.com/nf-core/sarek/pull/669) - Fix path to files when creating csv files
 
 ### Deprecated
@@ -182,6 +176,14 @@ Pårtejekna is one of glaciers of the Pårte Massif.
 - [#605](https://github.com/nf-core/sarek/pull/605) - Removed Scatter/gather from GATK_SINGLE_SAMPLE_GERMLINE_VARIANT_CALLING, all intervals are processed together
 - [#643](https://github.com/nf-core/sarek/pull/643) - Removed Sentieon parameters
 
+## [2.7.2](https://github.com/nf-core/sarek/releases/tag/2.7.2) - Áhkká
+
+Áhkká is one of the massifs just outside of the Sarek National Park.
+
+### Fixed
+
+- [#566](https://github.com/nf-core/sarek/pull/566) - Fix caching bug affecting a variable number of `MapReads` jobs due to non-deterministic state of `statusMap` during caching evaluation
+
 ## [2.7.1](https://github.com/nf-core/sarek/releases/tag/2.7.1) - Pårtejekna
 
 Pårtejekna is one of the glaciers of the Pårte Massif.
diff --git a/conf/test.config b/conf/test.config
@@ -44,7 +44,7 @@ params {
     vep_version       = '106.1'
 
     // Ignore params that will throw warning through params validation
-    schema_ignore_params = "genomes,test_data"
+    schema_ignore_params = 'genomes,test_data,snpeff_version,vep_version'
 }
 
 profiles {
diff --git a/conf/test_full.config b/conf/test_full.config
@@ -19,7 +19,6 @@ params {
 
     // Other params
     tools = 'strelka,freebayes,haplotypecaller,deepvariant,manta,tiddit,cnvkit,vep'
-    schema_ignore_params = 'genomes'
 
     split_fastq = 50000000
 }
diff --git a/conf/test_full_somatic.config b/conf/test_full_somatic.config
@@ -19,8 +19,7 @@ params {
 
     // Other params
     tools = 'strelka,mutect2,freebayes,ascat,manta,cnvkit,tiddit,controlfreec,vep'
-    schema_ignore_params = 'genomes'
-    wes = true
-    intervals = 's3://nf-core-awsmegatests/sarek/input/S07604624_Padded_Agilent_SureSelectXT_allexons_V6_UTR.bed'
     split_fastq = 20000000
+    intervals   = 's3://nf-core-awsmegatests/sarek/input/S07604624_Padded_Agilent_SureSelectXT_allexons_V6_UTR.bed'
+    wes         = true
 }
diff --git a/docs/usage.md b/docs/usage.md
@@ -482,8 +482,8 @@ If you have any questions or issues please send us a message on [Slack](https://
 
 When using default parameters only, sarek runs preprocessing and exits after base quality score recalibration. This is reflected in the default test profile:
 
-```
-nextflow run nf-core/sarek -r 3.0.0 -profile test,<container/institute>
+```console
+nextflow run nf-core/sarek -r 3.0 -profile test,<container/institute>
 ```
 
 Expected run output:
@@ -532,13 +532,13 @@ Expected run output:
 
 The pipeline comes with a number of possible paths and tools that can be used. The easiest and fastest test to see that the preprocessing + variantcalling (in this case Strelka2) works, is to run:
 
-```
+```console
 nextflow run nf-core/sarek -r 3.0 -profile test,<container/institute> --tools strelka
 ```
 
 Due to the small test data size, unfortunately not everything can be tested from top-to-bottom, but often is done by utilizing the pipeline's `--step` parameter. Annotation has to be tested separately from the remaining workflow, since we use references for `C.elegans`, while the remaining tests are run on downsampled human data.
 
-```
+```console
 nextflow run nf-core/sarek -r 3.0 -profile test,<container/institute> --tools snpeff --step annotation
 ```
 
@@ -611,7 +611,7 @@ In addition, currently the mismatch penalty for reads with tumor status in the s
 When plots are missing, it is possible that the fasta and the custom SnpEff database are not matching https://pcingola.github.io/SnpEff/se_faq/#error_chromosome_not_found-details.
 The SnpEff completes without throwing an error causing nextflow to complete successfully. An indication for the error are these lines in the `.command` files:
 
-```
+```text
 ERRORS: Some errors were detected
 Error type      Number of errors
 ERROR_CHROMOSOME_NOT_FOUND      17522411
@@ -627,7 +627,7 @@ If you have problems running processes that make use of Spark such as `MarkDupli
 You are probably experiencing issues with the limit of open files in your system.
 You can check your current limit by typing the following:
 
-```bash
+```console
 ulimit -n
 ```
 
@@ -636,20 +636,20 @@ In order to increase the size limit permanently you can:
 
 Edit the file `/etc/security/limits.conf` and add the lines:
 
-```bash
+```console
 *     soft   nofile  65535
 *     hard   nofile  65535
 ```
 
 Edit the file `/etc/sysctl.conf` and add the line:
 
-```bash
+```console
 fs.file-max = 65535
 ```
 
 Edit the file `/etc/sysconfig/docker` and add the new limits to OPTIONS like this:
 
-```bash
+```console
 OPTIONS="--default-ulimit nofile=65535:65535"
 ```
 
@@ -681,6 +681,36 @@ Recent updates to Samtools have been introduced, which can speed-up performance
 The current workflow does not handle duplex UMIs (i.e. where opposite strands of a duplex molecule have been tagged with a different UMI), and best practices have been proposed to process this type of data.
 Both changes will be implemented in a future release.
 
+## How to run sarek when no(t all) reference files are in igenomes
+
+For common genomes, such as GRCh38 and GRCh37, the pipeline is shipped with (almost) all necessary reference files. However, sometimes it is necessary to use custom references for some or all files:
+
+### No igenomes reference files are used
+
+If none of your required genome files are in igenomes, `--igenomes_ignore` must be set to ignore any igenomes input, together with `--genome null`. The `fasta` file is the only required input file and must be provided to run the pipeline. All other possible reference files can be provided in addition. For details, see the parameter documentation.
+
+Minimal example for custom genomes:
+
+```console
+nextflow run nf-core/sarek --genome null --igenomes_ignore --fasta <custom.fasta>
+```
+
+### Overwrite specific reference files
+
+If you don't want to use some of the provided reference genomes, they can be overwritten by either providing a new file or setting the respective file parameter to `false`, if it should be ignored:
+
+Example for using a custom known indels file:
+
+```console
+nextflow run nf-core/sarek --known_indels <my_known_indels.vcf.gz> --genome GRCh38.GATK
+```
+
+Example for not using known indels, but all other provided reference files:
+
+```console
+nextflow run nf-core/sarek --known_indels false --genome GRCh38.GATK
+```
+
 ### Where do the used reference genomes originate from
 
 _under construction - help needed_
@@ -747,40 +777,7 @@ nextflow run nf-core/sarek --known_indels false --genome GRCh38.GATK
 
 ## How to customise SnpEff and VEP annotation
 
-_under construction help needed_
-
-Sarek comes shipped with containers for both snpEff and VEP for human reference genomes with `--genome GATK.GRCh38` and `--genome GATK.GRCh37`. Different containers however can be provided.
-
-<!-- #### Create containers
-
-The cache has to be downloaded.
-
-`sareksnpeff`, our `snpeff` container is designed using [Conda](https://conda.io/).
-
-[![sareksnpeff-docker status](https://img.shields.io/docker/automated/nfcore/sareksnpeff.svg)](https://hub.docker.com/r/nfcore/sareksnpeff)
-
-Based on [nfcore/base:1.12.1](https://hub.docker.com/r/nfcore/base/tags), it contains:
-
-- **[snpEff](http://snpeff.sourceforge.net/)** 4.3.1t
-- Cache for `GRCh37`, `GRCh38`, `GRCm38`, `CanFam3.1` or `WBcel235`
-
-`sarekvep`, our `vep` container is designed using [Conda](https://conda.io/).
-
-[![sarekvep-docker status](https://img.shields.io/docker/automated/nfcore/sarekvep.svg)](https://hub.docker.com/r/nfcore/sarekvep)
-
-Based on [nfcore/base:1.12.1](https://hub.docker.com/r/nfcore/base/tags), it contains:
-
-- **[GeneSplicer](https://ccb.jhu.edu/software/genesplicer/)** 1.0
-- **[VEP](https://github.com/Ensembl/ensembl-vep)** 99.2
-- Cache for `GRCh37`, `GRCh38`, `GRCm38`, `CanFam3.1` or `WBcel235` -->
-
-  <!-- "snpeff_db"
-  "snpeff_genome":
-  "snpeff_version":
-  "vep_genome":
-  "vep_species":
-  "vep_cache_version":
-  "vep_version": -->
+Sarek uses nf-core provided containers for both snpEff and VEP for several reference genomes ('CanFam3', 'GRCh37', 'GRCh38', 'GRCm38' and 'WBcel235').
 
 ### Using downloaded cache
 
@@ -790,56 +787,56 @@ You need to specify the cache directory using `--snpeff_cache` and `--vep_cache`
 
 Example:
 
-```bash
+```console
 nextflow run nf-core/sarek --tools snpEff --step annotate --sample <file.vcf.gz> --snpeff_cache </path/to/snpEff/cache>
 nextflow run nf-core/sarek --tools VEP --step annotate --sample <file.vcf.gz> --vep_cache </path/to/VEP/cache>
 ```
 
-### Download cache
-
-A `Nextflow` helper script has been designed to help downloading `snpEff` and `VEP` caches.
-Such files are meant to be shared between multiple users, so this script is mainly meant for people administrating servers, clusters and advanced users.
-
-```bash
-nextflow run download_cache.nf --snpeff_cache </path/to/snpEff/cache> --snpeff_db <snpEff DB version> --genome <GENOME>
-nextflow run download_cache.nf --vep_cache </path/to/VEP/cache> --species <species> --vep_cache_version <VEP cache version> --genome <GENOME>
-```
+Similarly, when wanting to use a different cache than the one specified in the iGenomes config file, one can use `--snpeff_db`, `--snpeff_genome`, `--snpeff_version`, `--vep_cache_version`, `--vep_genome`, `--vep_species` and `--vep_version` to overwrite the default values related to the databases, genomes, versions and caches' versions used by these tools.
 
 ### Using VEP plugins
 
-<!-- To enable the use of the `VEP` `CADD` plugin:
+#### dbnsfp
 
-- Download the `CADD` files
-- Specify them (either on the command line, like in the example or in a configuration file)
-- use the `--cadd_cache` flag
+Enable with `--vep_dbnsfp`. The following parameters are mandatory:
 
-Example:
+- `--dbnsfp`, to specify the path to the dbNSFP processed file.
+- `--dbnsfp_tbi`, to specify the path to the dbNSFP tabix indexed file.
 
-```bash
-nextflow run nf-core/sarek --step annotate --tools VEP --sample <file.vcf.gz> --cadd_cache \
-    --cadd_indels </path/to/CADD/cache/InDels.tsv.gz> \
-    --cadd_indels_tbi </path/to/CADD/cache/InDels.tsv.gz.tbi> \
-    --cadd_wg_snvs </path/to/CADD/cache/whole_genome_SNVs.tsv.gz> \
-    --cadd_wg_snvs_tbi </path/to/CADD/cache/whole_genome_SNVs.tsv.gz.tbi>
-```
+The following parameters are optional:
 
-#### Downloading CADD files
+- `--dbnsfp_consequence`, to filter/limit outputs to a specific effect of the variant.
+  - The set of consequence terms is defined by the Sequence Ontology and an overview of those used in VEP can be found [here](https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html).
+  - If one wants to filter using several consequences, then separate those by using '&' (e.g. `--dbnsfp_consequence '3_prime_UTR_variant&intron_variant'`).
+- `--dbnsfp_fields`, to retrieve individual values from the dbNSFP file.
+  - The values correspond to the name of the columns in the dbNSFP file and are separated by comma.
+  - The column names might differ between the different dbNSFP versions. Please check the Readme.txt file, which is provided with the dbNSFP file, to obtain the correct column names. The Readme file also contains a short description of the provided values and the version of the tools used to generate them.
 
-An helper script has been designed to help downloading `CADD` files.
-Such files are meant to be share between multiple users, so this script is mainly meant for people administrating servers, clusters and advanced users.
+For more details, see [here](https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp).
 
-```bash
-nextflow run download_cache.nf --cadd_cache </path/to/CADD/cache> --cadd_version <CADD version> --genome <GENOME>
-``` -->
+#### LOFTEE
 
-#### dbnsfp
+Enable with `--vep_loftee`.
 
-#### LOFTEE
+For more details, see [here](https://github.com/konradjk/loftee).
 
 #### SpliceAi
 
+Enable with `--vep_spliceai`. The following parameters are mandatory:
+
+- `--spliceai_snv`, to specify the path to SpliceAI raw scores snv file.
+- `--spliceai_snv_tbi`, to specify the path to SpliceAI raw scores snv tabix indexed file.
+- `--spliceai_indel`, to specify the path to SpliceAI raw scores indel file.
+- `--spliceai_indel_tbi`, to specify the path to SpliceAI raw scores indel tabix indexed file.
+
+For more details, see [here](https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#spliceai).
+
 #### SpliceRegions
 
+Enable with `--vep_spliceregion`.
+
+For more details, see [here](https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#spliceregion) and [here](https://www.ensembl.info/2018/10/26/cool-stuff-the-vep-can-do-splice-site-variant-annotation/).
+
 ## Requested resources for the tools
 
 Resource requests are difficult to generalize and are often dependent on input data size. Currently, the number of cpus and memory requested by default were adapted from tests on 5 ICGC paired whole-genome sequencing samples with approximately 40X and 80X depth.
diff --git a/nextflow.config b/nextflow.config
@@ -99,7 +99,7 @@ params {
     help                       = false
     validate_params            = true
     show_hidden_params         = false
-    schema_ignore_params       = 'genomes'
+    schema_ignore_params       = 'genomes,snpeff_version,vep_version'
     enable_conda               = false
 
     // Config options
@@ -182,8 +182,8 @@ profiles {
         podman.enabled         = false
         shifter.enabled        = false
     }
-    test      { includeConfig 'conf/test.config' }
-    test_full { includeConfig 'conf/test_full.config' }
+    test              { includeConfig 'conf/test.config' }
+    test_full         { includeConfig 'conf/test_full.config' }
     test_full_somatic { includeConfig 'conf/test_full_somatic.config' }
 }
 

Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,7 @@ params {`
`44`	`44`	`vep_version = '106.1'`
`45`	`45`
`46`	`46`	`// Ignore params that will throw warning through params validation`
`47`		`- schema_ignore_params = "genomes,test_data"`
	`47`	`+ schema_ignore_params = 'genomes,test_data,snpeff_version,vep_version'`
`48`	`48`	`}`
`49`	`49`
`50`	`50`	`profiles {`
Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,6 @@ params {`
`19`	`19`
`20`	`20`	`// Other params`
`21`	`21`	`tools = 'strelka,freebayes,haplotypecaller,deepvariant,manta,tiddit,cnvkit,vep'`
`22`		`- schema_ignore_params = 'genomes'`
`23`	`22`
`24`	`23`	`split_fastq = 50000000`
`25`	`24`	`}`