Merge branch 'dev' into kraken2-emptyfastq-fix

jfy133 · web-flow · commit 634ae165c25a · 2022-07-04T13:54:21.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,11 +7,15 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### `Added`
 
+
 ### `Fixed`
 
 - [#882](https://github.com/nf-core/eager/pull/882) Define DSL1 execution explicitly, as new versions Nextflow made DSL2 default (♥ to & fix from @Lehmann-Fabian)
 - [#879](https://github.com/nf-core/eager/issues/879) Add missing threads parameter for pre-clipping FastQC for single end data that caused insufficient memory in some cases (♥ to @marcel-keller for reporting)
 - [#885](https://github.com/nf-core/eager/issues/885) Specify task memory for all tools in get_software_versions to account for incompatibilty of java with some SGE clusters causing hanging of the process (♥ to @maxibor for reporting)
+- [#887](https://github.com/nf-core/eager/issues/887) Clarify what is considered 'ultra-short' reads in the help text of clip_readlength, for when you may wish to turn off length filtering during AdapterRemoval (♥ to @TCLamnidis for reporting)
+- [#889](https://github.com/nf-core/eager/issues/889) Remove/update parameters in benchmarking test profiles (♥ to @TCLamnidis for reporting)
+- [#895](https://github.com/nf-core/eager/issues/895) Fix typo in output documentation and add location of output docs to pipeline summary (♥ to @RodrigoBarquera for reporting)
 - [#897](https://github.com/nf-core/eager/issues/897) Fix pipeline crashing if no Kraken2 results generated (♥ to @alexandregilardet for reporting)
 
 ### `Dependencies`
diff --git a/conf/benchmarking_human.config b/conf/benchmarking_human.config
@@ -12,26 +12,24 @@ params {
    config_profile_description = "A 'fullsized' benchmarking profile for deepish Human sequencing aDNA data" 
 
    //Input data
-   input = 'https://raw.githubusercontent.com/jfy133/test-datasets/eager/testdata/Benchmarking/benchmarking_human.tsv'
+   input = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Benchmarking/benchmarking_human.tsv'
    // Genome reference
    fasta = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz'
 
    run_bam_filtering = true
-   bam_discard_unmapped = true
    bam_unmapped_type = 'discard'
    bam_mapping_quality_threshold = 30
 
    dedupper = 'markduplicates'
   
    run_trim_bam = true
-   bamutils_clip_left = 1
-   bamutils_clip_right = 1
+   bamutils_clip_double_stranded_none_udg_left = 1
+   bamutils_clip_double_stranded_none_udg_right = 1
    
    // JAR will need to be downloaded first!
    run_genotyping = true
    genotyping_tool = 'ug'
    genotyping_source = 'trimmed'
-   gatk_ug_jar = 'GenomeAnalysisTK.jar'
    gatk_call_conf = 20
 
    run_sexdeterrmine = true
@@ -41,8 +39,6 @@ params {
    contamination_chrom_name = 'chrX'
 
    run_mtnucratio = true
-
-
 }
 
 process {
diff --git a/conf/benchmarking_vikingfish.config b/conf/benchmarking_vikingfish.config
@@ -20,7 +20,6 @@ params {
    bwaalnl = 1024
    
    run_bam_filtering = true
-   bam_discard_unmapped = true
    bam_unmapped_type = 'discard'
    bam_mapping_quality_threshold = 25
      
diff --git a/docs/output.md b/docs/output.md
@@ -679,7 +679,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
 * `fastqc/`: this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval.
 * `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) `fastq` files. These you can use for downstream applications such as taxonomic binning for metagenomic studies.
 * `post_ar_fastq_trimmed`: this contains `fastq` files that have been additionally trimmed after AdapterRemoval (if turned on). These reads are usually that had internal barcodes, or damage that needed to be removed before mapping.
-* `mapping/`: this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). You will also find a corresponding BAM index file (ending in `.csi` or `.bam`), and if running the `bowtie2` mapper: a log ending in `_bt2.log`. You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in nf-core/eager (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!).
+* `mapping/`: this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). You will also find a corresponding BAM index file (ending in `.csi` or `.bai`), and if running the `bowtie2` mapper: a log ending in `_bt2.log`. You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in nf-core/eager (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!).
 * `samtools/`: this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file.
 * `deduplication/`: this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics.
 * `endorSpy/`: this contains all JSON files exported from the endorSpy endogenous DNA calculation tool. The JSON files are generated specifically for display in the MultiQC general statistics table and is otherwise very likely not useful for you.
diff --git a/main.nf b/main.nf
@@ -3320,6 +3320,7 @@ workflow.onComplete {
     if (workflow.success) {
         log.info "-${c_purple}[nf-core/eager]${c_green} Pipeline completed successfully${c_reset}-"
         log.info "-${c_purple}[nf-core/eager]${c_green} MultiQC run report can be found in ${params.outdir}/multiqc ${c_reset}-"
+        log.info "-${c_purple}[nf-core/eager]${c_green} Further output documentation can be seen at https://nf-co.re/eager/output ${c_reset}-"
     } else {
         checkHostname()
         log.info "-${c_purple}[nf-core/eager]${c_red} Pipeline completed with errors${c_reset}-"
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -475,7 +475,7 @@
                     "default": 30,
                     "description": "Specify read minimum length to be kept for downstream analysis.",
                     "fa_icon": "fas fa-ruler",
-                    "help_text": "Defines the minimum read length that is required for reads after merging to be considered for downstream analysis after read merging. Default is `30`.\n\nNote that performing read length filtering at this step is not reliable for correct endogenous DNA calculation, when you have a large percentage of very short reads in your library - such as retrieved in single-stranded library protocols. When you have very few reads passing this length filter, it will artificially inflate your endogenous DNA by creating a very small denominator. In these cases it is recommended to set this to 0, and use `--bam_filter_minreadlength` instead, to filter out 'un-usable' short reads after mapping.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n"
+                    "help_text": "Defines the minimum read length that is required for reads after merging to be considered for downstream analysis. Default is `30`.\n\nNote that when you have a large percentage of very short reads in your library (< 20 bp) - such as retrieved in single-stranded library protocols - performing read length filtering at this step is not _always_ reliable for correct endogenous DNA calculation. When you have very few reads passing this length filter, it will artificially inflate your 'endogenous DNA' value by creating a very small denominator.\n\nIf you notice you have ultra short reads (< 20 bp), it is recommended to set this parameter to 0, and use `--bam_filter_minreadlength` instead, to filter out 'un-usable' short reads after mapping. A caveat, however, is that this will cause a very large increase in computational run time, as all reads in the library will be mapped.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n"
                 },
                 "clip_min_read_quality": {
                     "type": "integer",
@@ -1683,7 +1683,7 @@
                 "maltextract_percentidentity": {
                     "type": "number",
                     "description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.",
-                    "default": 85.0,
+                    "default": 85,
                     "fa_icon": "fas fa-id-card",
                     "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85.0`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--minPI`"
                 },