Merge branch 'dev' into arfp-pigz-fix

jfy133 · web-flow · commit 01a89a0f003f · 2022-03-17T16:51:55.000+01:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -201,6 +201,10 @@ jobs:
       - name: METAGENOMIC Run the basic pipeline but with unmapped reads going into Kraken
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_kraken,docker --run_bam_filtering  --bam_unmapped_type 'fastq'
+      - name: SNPCAPTURE Run the basic pipeline with the bam input profile, generating statistics with a SNP capture bed
+        run: |
+          wget https://github.com/nf-core/test-datasets/raw/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz && gunzip 1240K.pos.list_hs37d5.0based.bed.gz
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --snpcapture_bed 1240K.pos.list_hs37d5.0based.bed
       - name: SEXDETERMINATION Run the basic pipeline with the bam input profile, but don't convert BAM, skip everything but sex determination
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --run_sexdeterrmine
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,10 +9,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### `Fixed`
 
+- [#828](https://github.com/nf-core/eager/issues/828) Improved error message if required metagenomic screening parameters not set correctly
+- [#836](https://github.com/nf-core/eager/issues/836) Remove deprecated parameters from test profiles
+- [#838](https://github.com/nf-core/eager/issues/838) Fix --snpcapture_bed files not being picked up by Nextflow
 - [#843](https://github.com/nf-core/eager/issues/843) Re-add direct piping of AdapterRemovalFixPrefix to pigz
+- [#845](https://github.com/nf-core/eager/issues/845) Updates parameter documentation to specify `-s` preseq parameter also applies to lc_extrap
+- [#851](https://github.com/nf-core/eager/issues/851) Fixes a file-name clash during additional_library_merge, post-BAM trimming of different UDG treated libraries of a sample
+- Fix PMDtools reference mask not being picked up by Nextflow, and its use being evaluated against --snpcapture_bed rather than --pmdtools_reference_mask
+- Renamed a range of MultiQC general stats table headers to improve clarity, documentation has been updated accordingly
 
 ### `Dependencies`
 
+- Bumped MultiQC: 1.11 -> 1.12 (for run-time optimisation and tool citation information)
+
 ### `Deprecated`
 
 ## [2.4.2] - 2022-01-24
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
@@ -61,43 +61,43 @@ extra_fn_clean_exts:
 
 top_modules:
     - 'fastqc':
-       name: 'FastQC (pre-Trimming)'
-       path_filters:
-           - '*_raw_fastqc.zip'
+        name: 'FastQC (pre-Trimming)'
+        path_filters:
+            - '*_raw_fastqc.zip'
     - 'fastp'
     - 'adapterRemoval'
     - 'fastqc':
-       name: 'FastQC (post-Trimming)'
-       path_filters:
+        name: 'FastQC (post-Trimming)'
+        path_filters:
             - '*.truncated_fastqc.zip'
             - '*.combined*_fastqc.zip'
     - 'bowtie2':
-       path_filters:
+        path_filters:
             - '*_bt2.log'
     - 'malt'
     - 'hops'
     - 'kraken'
     - 'samtools':
-       name: 'Samtools Flagstat (pre-samtools filter)'
-       path_filters:
+        name: 'Samtools Flagstat (pre-samtools filter)'
+        path_filters:
             - '*_flagstat.stats'
     - 'samtools':
-       name: 'Samtools Flagstat (post-samtools filter)'
-       path_filters:
+        name: 'Samtools Flagstat (post-samtools filter)'
+        path_filters:
             - '*_postfilterflagstat.stats'
     - 'dedup'
     - 'picard'
     - 'preseq':
-       path_filters:
-           - '*.preseq'
+        path_filters:
+            - '*.preseq'
     - 'damageprofiler'
     - 'mtnucratio'
     - 'qualimap'
     - 'sexdeterrmine'
     - 'bcftools'
     - 'multivcfanalyzer':
-       path_filters:
-           - '*MultiVCFAnalyzer.json'
+        path_filters:
+            - '*MultiVCFAnalyzer.json'
 qualimap_config:
     general_stats_coverage:
         - 1
@@ -107,7 +107,7 @@ qualimap_config:
         - 5
 
 remove_sections:
-  - sexdeterrmine-snps
+    - sexdeterrmine-snps
 
 table_columns_visible:
     FastQC (pre-Trimming):
@@ -272,5 +272,45 @@ report_section_order:
         order: -1000
     nf-core-eager-summary:
         order: -1001
-
 export_plots: true
+table_columns_name:
+    FastQC (pre-Trimming):
+        total_sequences: "Nr. Input Reads"
+        avg_sequence_length: "Length Input Reads"
+        percent_gc: "% GC Input Reads"
+        percent_duplicates: "% Dups Input Reads"
+        percent_fails: "% Failed Input Reads"
+    FastQC (post-Trimming):
+        total_sequences: "Nr. Processed Reads"
+        avg_sequence_length: "Length Processed Reads"
+        percent_gc: "% GC Processed Reads"
+        percent_duplicates: "% Dups Processed Reads"
+        percent_fails: "% Failed Processed Reads"
+    Samtools Flagstat (pre-samtools filter):
+        flagstat_total: "Nr. Reads Into Mapping"
+        mapped_passed: "Nr. Mapped Reads"
+    Samtools Flagstat (post-samtools filter):
+        flagstat_total: "Nr. Mapped Reads Post-Filter"
+        mapped_passed: "Nr. Mapped Reads Passed Post-Filter"
+    Endogenous DNA Post (%):
+        endogenous_dna_post (%): "Endogenous DNA Post-Filter (%)"
+    Picard:
+        PERCENT_DUPLICATION: "% Dup. Mapped Reads"
+    DamageProfiler:
+        mean_readlength: "Mean Length Mapped Reads"
+        median_readlength: "Median Length Mapped Reads"
+    QualiMap:
+        mapped_reads: "Nr. Dedup. Mapped Reads"
+        total_reads: "Nr. Dedup. Total Reads"
+        avg_gc: "% GC Dedup. Mapped Reads"
+    Bcftools Stats:
+        number_of_records: "Nr. Overall Variants"
+        number_of_SNPs: "Nr. SNPs"
+        number_of_indels: "Nr. InDels"
+    MALT:
+        Mappability: "% Metagenomic Mappability"
+    SexDetErrmine:
+        RateErrX: "SexDet Err X Chr"
+        RateErrY: "SexDet Err Y Chr"
+        RateX: "SexDet Rate X Chr"
+        RateY: "SexDet Rate Y Chr"
diff --git a/conf/test_full.config b/conf/test_full.config
@@ -21,7 +21,6 @@ params {
    bwaalnl = 1024
    
    run_bam_filtering = true
-   bam_discard_unmapped = true
    bam_unmapped_type = 'discard'
    bam_mapping_quality_threshold = 25
      
diff --git a/conf/test_stresstest_human.config b/conf/test_stresstest_human.config
@@ -24,7 +24,6 @@ params {
    mtnucratio_header = 'ChrM'
 
    run_bam_filtering = true
-   bam_discard_unmapped = true
    bam_unmapped_type = 'discard'
    bam_mapping_quality_threshold = 30
 
diff --git a/docs/output.md b/docs/output.md
@@ -59,36 +59,37 @@ This table will report values per-file, library, or sample statistics depending
 
 Each column name is supplied by the module, so you may see similar column names. When unsure, hovering over the column name will allow you see which module it is derived from.
 
-The possible columns displayed by default are as follows:
+The possible columns displayed by default are as follows (note you may see additional columns depending on what other modules you activate):
 
 * **Sample Name** This is the log file name without file suffix(s). This will depend on the module outputs.
-* **Seqs** This is from Pre-AdapterRemoval FastQC. Represents the number of raw reads in your untrimmed and (paired end) unmerged FASTQ file. Each row should be approximately equal to the number of reads you requested to be sequenced, divided by the number of FASTQ files you received for that library.
-* **Length** This is from Pre-AdapterRemoval FastQC. This is the average read length in your untrimmed and (paired end) unmerged FASTQ file and should represent the number of cycles of your sequencing chemistry.
-* **%GC** This is from Pre-AdapterRemoval FastQC. This is the average GC content in percent of all the reads in your untrimmed and (paired end) unmerged FASTQ file.
+* **Nr. Input Reads** This is from Pre-AdapterRemoval FastQC. Represents the number of raw reads in your untrimmed and (paired end) unmerged FASTQ file. Each row should be approximately equal to the number of reads you requested to be sequenced, divided by the number of FASTQ files you received for that library.
+* **Length Input Reads** This is from Pre-AdapterRemoval FastQC. This is the average read length in your untrimmed and (paired end) unmerged FASTQ file and should represent the number of cycles of your sequencing chemistry.
+* **% GC Input Reads** This is from Pre-AdapterRemoval FastQC. This is the average GC content in percent of all the reads in your untrimmed and (paired end) unmerged FASTQ file.
 * **GC content** This is from FastP. This is the average GC of all reads in your untrimmed and unmerged FASTSQ file after poly-G tail trimming. If you have lots of tails, this value should drop from the pre-AdapterRemoval FastQC  %GC column.
 * **% Trimmed** This is from AdapterRemoval. It is the percentage of reads which had an adapter sequence removed from the end of the read.
-* **Seqs** This is from Post-AdapterRemoval FastQC. Represents the number of preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. The loss between this number and the Pre-AdapterRemoval FastQC can give you an idea of the quality of trimming and merging.
-* **%GC** This is from Post-AdapterRemoval FastQC. Represents the average GC of all preprocessed reads in your adapter trimmed (paired end) merged FASTQ file.
-* **Length** This is from post-AdapterRemoval FastQC. This is the average read length in your trimmed and (paired end) merged FASTQ file and should represent the 'realistic' average lengths of your DNA molecules
+* **Nr. Processed Reads** This is from Post-AdapterRemoval FastQC. Represents the number of preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. The loss between this number and the Pre-AdapterRemoval FastQC can give you an idea of the quality of trimming and merging.
+* **% GC Processed Reads** This is from Post-AdapterRemoval FastQC. Represents the average GC of all preprocessed reads in your adapter trimmed (paired end) merged FASTQ file.
+* **Length Processed Reads** This is from post-AdapterRemoval FastQC. This is the average read length in your trimmed and (paired end) merged FASTQ file and should represent the 'realistic' average lengths of your DNA molecules
 * **% Aligned** This is from bowtie2. It reports the percentage of input reads that mapped to your reference genome. This number will be likely similar to Endogenous DNA % (see below).
-* **Mappability** This is from MALT. It reports the percentage of the off-target reads (from mapping), that could map to your MALT metagenomic database. This can often be low for aDNA due to short reads and database bias.
+* **% Metagenomic Mappability** This is from MALT. It reports the percentage of the off-target reads (from mapping), that could map to your MALT metagenomic database. This can often be low for aDNA due to short reads and database bias.
 * **% Unclassified** This is from Kraken. It reports the percentage of reads that could not be aligned and taxonomically assigned against your Kraken metagenomic database. This can often be high for aDNA due to short reads and database bias.
-* **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering.
+* **Nr. Reads Into Mapping** This is from Samtools. This is the raw number of preprocessed reads that went into mapping.
+* **Nr. Mapped Reads** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering.
 * **Endogenous DNA (%)** This is from the endorS.py tool. It displays a percentage of mapped reads over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). Assuming a perfect ancient sample with no modern contamination, this would be the amount of true ancient DNA in the sample. However this value _most likely_ include contamination and will not entirely be the true 'endogenous' content.
-* **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second)
-* **Endogenous DNA Post (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (i.e. for mapping quality and/or bam-level length filtering) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM.
+* **Nr. Mapped Reads Post-Filter** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second)
+* **Endogenous DNA Post-Filter (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (i.e. for mapping quality and/or bam-level length filtering) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM.
 * **ClusterFactor** This is from **DeDup only**. This is a value representing how many duplicates in the library exist for each unique read. This ratio is calculated as `reads_before_deduplication / reads_after_deduplication`. Can be converted to %Dups by calculating `1 - (1  / CF)`. A cluster factor close to one indicates a highly complex library and could be sequenced further. Generally with a value of more than 2 you will not be gaining much more information by sequencing deeper.
-* **%Dups** This is from **Picard's markDuplicates only**. It represents the percentage of reads in your library that were exact duplicates of other reads in your library. The lower the better, as high duplication rate means lots of sequencing of the same information (and therefore is not time or cost effective).
+* **% Dup. Mapped Reads** This is from **Picard's markDuplicates only**. It represents the percentage of reads in your library that were exact duplicates of other reads in your library. The lower the better, as high duplication rate means lots of sequencing of the same information (and therefore is not time or cost effective).
 * **X Prime Y>Z N base** These columns are from DamageProfiler. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half-UDG treatment a decrease in frequency from the 1st to 2nd base.
-* **Mean Read Length** This is from DamageProfiler. This is the mean length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary.
-* **Median Read Length** This is from DamageProfiler. This is the median length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary.
-* **Aligned** This is from Qualimap. This is the total number of _deduplicated_ reads that mapped to your reference genome. This is the **best** number to report for final mapped reads in final publications.
+* **Mean Length Mapped Reads** This is from DamageProfiler. This is the mean length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary.
+* **Median Length Mapped Reads** This is from DamageProfiler. This is the median length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary.
+* **Nr. Dedup. Mapped Reads** This is from Qualimap. This is the total number of _deduplicated_ reads that mapped to your reference genome. This is the **best** number to report for final mapped reads in final publications.
 * **Mean/Median Coverage** This is from Qualimap. This is the mean/median number of times a base on your reference genome was covered by a read (i.e. depth coverage). This average includes bases with 0 reads covering that position.
 * **>= 1X** to **>= 5X** These are from Qualimap. This is the percentage of the genome covered at that particular depth coverage.
-* **% GC** This is the mean GC content in percent of all mapped reads post-deduplication. This should normally be close to the GC content of your reference genome.
+* **% GC Dedup. Mapped Reads** This is the mean GC content in percent of all mapped reads post-deduplication. This should normally be close to the GC content of your reference genome.
 * **MT to Nuclear Ratio** This from MTtoNucRatio. This reports the number of reads aligned to a mitochondrial entry in your reference FASTA to all other entries. This will typically be high but will vary depending on tissue type.
-* **XRate** This is from Sex.DetERRmine. This is the relative depth of coverage on the X-chromosome.
-* **YRate** This is from Sex.DetERRmine. This is the relative depth of coverage on the Y-chromosome.
+* **SexDet Rate X Chr** This is from Sex.DetERRmine. This is the relative depth of coverage on the X-chromosome.
+* **SexDet Rate Y Chr** This is from Sex.DetERRmine. This is the relative depth of coverage on the Y-chromosome.
 * **#SNPs Covered** This is from eigenstrat\_snp\_coverage. The number of called SNPs after genotyping with pileupcaller.
 * **#SNPs Total** This is from eigenstrat\_snp\_coverage. The maximum number of covered SNPs, i.e. the number of SNPs in the .snp file provided to pileupcaller with `--pileupcaller_snpfile`.
 * **Number of SNPs** This is from ANGSD. The number of SNPs left after removing sites with no data in a 5 base pair surrounding region.
@@ -701,5 +702,6 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
     * finally, the `*.kraken.out` file are the direct output of Kraken2
 * `maltextract/`: this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA)
 * `consensus_sequence/`: this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively.
-* `librarymerged_bams/`: these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on)
+* `merged_bams/initial`: these contain the BAM files that would go into UDG-treatment specific BAM trimming. All libraries of the same sample, **and** same UDG-treatment type will be in these BAM files.
+* `merged_bams/additional`: these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on)
 * `bcftools`: this currently contains a single directory called `stats/` that includes general statistics on variant callers producing VCF files as output by `bcftools stats`. These includethings such as the number of positions, number of transititions/transversions and depth coverage of SNPs etc. These are only produced if `--run_bcftools_stats` is supplied.
diff --git a/environment.yml b/environment.yml
diff --git a/main.nf b/main.nf
diff --git a/nextflow_schema.json b/nextflow_schema.json