
Commit 08a0894

Merge branch 'dev' into new-images

2 parents 47089ab + 492b1aa commit 08a0894

11 files changed: 278 additions & 126 deletions

.github/workflows/ci.yml

Lines changed: 10 additions & 4 deletions
@@ -146,16 +146,16 @@ jobs:
           nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_pmdtools
       - name: GENOTYPING_UG AND MULTIVCFANALYZER Test running GATK UnifiedGenotyper and MultiVCFAnalyzer, additional VCFs
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer --additional_vcf_files 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/vcf/JK2772_CATCAGTGAGTAGA_L008_R1_001.fastq.gz.tengrand.fq.combined.fq.mapped_rmdup.bam.unifiedgenotyper.vcf.gz' --write_allele_frequencies
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer --additional_vcf_files 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/vcf/JK2772_CATCAGTGAGTAGA_L008_R1_001.fastq.gz.tengrand.fq.combined.fq.mapped_rmdup.bam.unifiedgenotyper.vcf.gz' --write_allele_frequencies
       - name: COMPLEX LANE/LIBRARY MERGING Test running lane and library merging prior to GATK UnifiedGenotyper and running MultiVCFAnalyzer
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer
       - name: GENOTYPING_UG ON TRIMMED BAM Test
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --run_trim_bam --genotyping_source 'trimmed' --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP'
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --run_trim_bam --genotyping_source 'trimmed' --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP'
       - name: BAM_INPUT Run the basic pipeline with the bam input profile, skip AdapterRemoval as no convertBam
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --skip_adapterremoval
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --skip_adapterremoval
       - name: BAM_INPUT Run the basic pipeline with the bam input profile, convert to FASTQ for adapterremoval test and downstream
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --run_convertinputbam

@@ -167,6 +167,9 @@ jobs:
       - name: METAGENOMIC Run the basic pipeline but with unmapped reads going into MALT
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt/" --malt_sam_output
+      - name: METAGENOMIC Run the basic pipeline but with low-complexity-filtered reads going into MALT
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt/" --metagenomic_complexity_filter
       - name: MALTEXTRACT Download resource files
         run: |
           mkdir -p databases/maltextract

@@ -186,3 +189,6 @@ jobs:
       - name: MTNUCRATIO Run basic pipeline with bam input profile, but don't convert BAM, skip everything but mtnucratio
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio
+      - name: RESCALING Run basic pipeline but with mapDamage rescaling of BAM files. Note this will be slow
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'

CHANGELOG.md

Lines changed: 6 additions & 2 deletions
@@ -7,13 +7,17 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

 ### `Added`

+- [#640](https://github.com/nf-core/eager/issues/640) - Added a pre-metagenomic-screening filter that removes low-sequence-complexity reads with `bbduk`
+- [#583](https://github.com/nf-core/eager/issues/583) - Added `mapDamage2` rescaling of BAM files to remove damage
 - Updated usage (merging files) and workflow images reflecting new functionality.

 ### `Fixed`

 - Removed leftover old DockerHub push CI commands.
-- [#627](https://github.com/nf-core/eager/issues/627) Added de Barros Damgaard citation to README
-- [#630](https://github.com/nf-core/eager/pull/630) Better handling of Qualimap memory requirements and error strategy.
+- [#627](https://github.com/nf-core/eager/issues/627) - Added de Barros Damgaard citation to README
+- [#630](https://github.com/nf-core/eager/pull/630) - Better handling of Qualimap memory requirements and error strategy.
+- Fixed some incomplete schema options to ensure users supply valid input values
+- [#638](https://github.com/nf-core/eager/issues/638#issuecomment-748877567) - Fixed inverted circularfilter filtering (previously filtering happened by default, rather than only when requested by the user as documented)

 ### `Dependencies`

README.md

Lines changed: 36 additions & 43 deletions
@@ -25,7 +25,39 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool

 <img src="docs/images/usage/eager2_workflow.png" alt="nf-core/eager schematic workflow" width="70%">
 </p>

-## Pipeline steps
+## Quick Start
+
+1. Install [`nextflow`](https://nf-co.re/usage/installation) (version >= 20.04.0)
+
+2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`Podman`](https://podman.io/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_
+
+3. Download the pipeline and test it on a minimal dataset with a single command:
+
+   ```bash
+   nextflow run nf-core/eager -profile test,<docker/singularity/podman/conda/institute>
+   ```
+
+   > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
+
+4. Start running your own analysis!
+
+   ```bash
+   nextflow run nf-core/eager -profile <docker/singularity/conda> --input '*_R{1,2}.fastq.gz' --fasta '<your_reference>.fasta'
+   ```
+
+5. Once your run has completed successfully, clean up the intermediate files.
+
+   ```bash
+   nextflow clean -f -k
+   ```
+
+See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline.
+
+**N.B.** You can see an overview of the run in the MultiQC report located at `./results/MultiQC/multiqc_report.html`
+
+Modifications to the default pipeline are easily made using various options as described in the documentation.
+
+## Pipeline Summary

 ### Default Steps

@@ -77,6 +109,7 @@ Additional functionality contained by the pipeline currently includes:

 #### Metagenomic Screening

+* Low-sequence-complexity filtering (`BBduk`)
 * Taxonomic binner with alignment (`MALT`)
 * Taxonomic binner without alignment (`Kraken2`)
 * aDNA characteristic screening of taxonomically binned data from MALT (`MaltExtract`)

@@ -89,48 +122,6 @@ A graphical overview of suggested routes through the pipeline depending on conte

 <img src="docs/images/usage/eager2_metromap_complex.png" alt="nf-core/eager metro map" width="70%">
 </p>

-## Quick Start
-
-1. Install [`nextflow`](https://nf-co.re/usage/installation) (version >= 20.04.0)
-
-2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`Podman`](https://podman.io/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_
-
-3. Download the pipeline and test it on a minimal dataset with a single command:
-
-   ```bash
-   nextflow run nf-core/eager -profile test,<docker/singularity/podman/conda/institute>
-   ```
-
-   > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
-
-4. Start running your own analysis!
-
-   ```bash
-   nextflow run nf-core/eager -profile <docker/singularity/conda> --input '*_R{1,2}.fastq.gz' --fasta '<your_reference>.fasta'
-   ```
-
-5. Once your run has completed successfully, clean up the intermediate files.
-
-   ```bash
-   nextflow clean -f -k
-   ```
-
-See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline.
-
-**N.B.** You can see an overview of the run in the MultiQC report located at `./results/MultiQC/multiqc_report.html`
-
-Modifications to the default pipeline are easily made using various options
-as described in the documentation.
-
-## Pipeline Summary
-
-By default, the pipeline currently performs the following:
-
-<!-- TODO nf-core: Fill in short bullet-pointed list of default steps of pipeline -->
-
-* Sequencing quality control (`FastQC`)
-* Overall pipeline run summaries (`MultiQC`)
-
 ## Documentation

 The nf-core/eager pipeline comes with documentation about the pipeline: [usage](https://nf-co.re/eager/usage) and [output](https://nf-co.re/eager/output).

@@ -236,6 +227,8 @@ In addition, references of tools and data used in this pipeline are as follows:

 * **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https://dx.doi.org/10.1038/nmeth.1923).
 * **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools)
 * **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git)
+* **mapDamage2** Jónsson, H., et al. 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics, 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193)
+* **BBduk** Brian Bushnell (Unpublished). Download: [https://sourceforge.net/projects/bbmap/](https://sourceforge.net/projects/bbmap/)

 ## Data References

assets/multiqc_config.yaml

Lines changed: 1 addition & 2 deletions
@@ -6,7 +6,6 @@ report_comment: >
     This report has been generated by the <a href="https://github.com/nf-core/eager" target="_blank">nf-core/eager</a>
     analysis pipeline. For information about how to interpret these results, please see the
     <a href="https://github.com/nf-core/eager" target="_blank">documentation</a>.
-
 run_modules:
   - adapterRemoval
   - bowtie2

@@ -270,4 +269,4 @@ report_section_order:
   nf-core-eager-summary:
     order: -1001

-export_plots: true
+export_plots: true

bin/scrape_software_versions.py

Lines changed: 7 additions & 2 deletions
@@ -35,7 +35,9 @@
     'VCF2genome':['v_vcf2genome.txt', r"VCF2Genome \(v. ([0-9].[0-9]+) "],
     'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"],
     'kraken':['v_kraken.txt', r"Kraken version (\S+)"],
-    'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"]
+    'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"],
+    'mapDamage2':['v_mapdamage.txt',r"(\S+)"],
+    'bbduk':['v_bbduk.txt',r"(\S+)"]
 }

 results = OrderedDict()

@@ -55,7 +57,7 @@
 results['Qualimap'] = '<span style="color:#999999;\">N/A</span>'
 results['Preseq'] = '<span style="color:#999999;\">N/A</span>'
 results['GATK HaplotypeCaller'] = '<span style="color:#999999;\">N/A</span>'
-#results['GATK UnifiedGenotyper'] = '<span style="color:#999999;\">N/A</span>'
+results['GATK UnifiedGenotyper'] = '<span style="color:#999999;\">N/A</span>'
 results['freebayes'] = '<span style="color:#999999;\">N/A</span>'
 results['sequenceTools'] = '<span style="color:#999999;\">N/A</span>'
 results['VCF2genome'] = '<span style="color:#999999;\">N/A</span>'

@@ -71,6 +73,9 @@
 results['kraken'] = '<span style="color:#999999;\">N/A</span>'
 results['maltextract'] = '<span style="color:#999999;\">N/A</span>'
 results['eigenstrat_snp_coverage'] = '<span style="color:#999999;\">N/A</span>'
+results['mapDamage2'] = '<span style="color:#999999;\">N/A</span>'
+results['bbduk'] = '<span style="color:#999999;\">N/A</span>'

 # Search each file using its regex
 for k, v in regexes.items():
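The `regexes` additions above follow the script's existing pattern: each entry maps a tool name to a version file emitted by the pipeline and a regex whose first capture group is the version string. A minimal, self-contained sketch of how such a scraper consumes those entries (the `fake_files` dictionary and its version strings are invented stand-ins for reading the real `v_*.txt` files):

```python
import re
from collections import OrderedDict

# Tool name -> [version file, capture regex], mirroring the entries added
# to bin/scrape_software_versions.py in this commit.
regexes = {
    'mapDamage2': ['v_mapdamage.txt', r"(\S+)"],
    'bbduk': ['v_bbduk.txt', r"(\S+)"],
}

# Invented stand-in for the version files each process writes to disk.
fake_files = {
    'v_mapdamage.txt': "2.2.0\n",
    'v_bbduk.txt': "38.87\n",
}

# Default every tool to 'N/A', as the real script does with HTML spans.
results = OrderedDict((tool, 'N/A') for tool in regexes)

for tool, (fname, pattern) in regexes.items():
    match = re.search(pattern, fake_files.get(fname, ''))
    if match:
        results[tool] = match.group(1)

print(results['mapDamage2'])  # 2.2.0
print(results['bbduk'])       # 38.87
```

Because `(\S+)` simply grabs the first non-whitespace token, it works for tools whose version file contains only the bare version number, which is why the new `mapDamage2` and `bbduk` entries need no tool-specific prefix in their regex.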

conf/test_resources.config

Lines changed: 4 additions & 0 deletions
@@ -51,4 +51,8 @@ process {
     time = { check_max( 10.m * task.attempt, 'time' ) }
   }

+  withName:'mapdamage_rescaling'{
+    time = { check_max( 20.m * task.attempt, 'time' ) }
+  }
+
 }

docs/output.md

Lines changed: 2 additions & 0 deletions
@@ -658,11 +658,13 @@ Each module has its own output directory which sits alongside the `MultiQC/` dir
 - `damageprofiler/` - this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files.
 - `pmdtools/` - this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a post-mortem damage (PMD) score of `--pmdtools_threshold`.
 - `trimmed_bam/` - this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags, and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read).
+- `damage_rescaling/` - this contains rescaled BAM files from mapDamage2. These BAM files have damage probabilistically removed via a Bayesian model, and can be used for downstream genotyping.
 - `genotyping/` - this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` is supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyper for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics.
 - `multivcfanalyzer/` - this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files.
 - `sex_determination/` - this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each BAM file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer.
 - `nuclear_contamination/` - this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt`, which is a summary table of the results for all individuals. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimates (with their respective standard errors) for both Method1 and Method2.
 - `bedtools/` - this contains two files as the output from bedtools coverage. One file contains the 'breadth' coverage (`*.breadth.gz`). This file will have the contents of your annotation file (e.g. BED/GFF), and the following subsequent columns: no. reads on feature, no. bases at depth, length of feature, and % of feature. The second file (`*.depth.gz`) contains the contents of your annotation file (e.g. BED/GFF), and an additional column which is mean depth coverage (i.e. average number of reads covering each position).
+- `metagenomic_complexity_filter/` - this contains the output from the removal of low-sequence-complexity reads with `bbduk` prior to metagenomic classification. This includes the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and the run-time log (`*_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `*_bbduk.stats` files to get summary statistics of the filtering.
 - `metagenomic_classification/` - this contains the output for a given metagenomic classifier.
   - Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additionally, a `malt.log` file is provided which gives information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzipped SAM files if requested.
   - Running kraken will contain the Kraken output and report files, as well as a merged taxon count table.
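The `bedtools/` entry above describes the column layout of the 'breadth' table: the annotation (e.g. BED) columns come first, followed by reads on feature, bases at depth, feature length, and fraction of the feature covered. A minimal sketch of reading a gzipped table with that layout (the file name, feature names, and rows here are invented for illustration):

```python
import gzip
import tempfile
from pathlib import Path

# Invented example rows in the layout described above: BED columns first,
# then reads-on-feature, bases-at-depth, feature length, fraction covered.
content = (
    "chr1\t100\t200\tgeneA\t15\t80\t100\t0.8000000\n"
    "chr1\t300\t450\tgeneB\t0\t0\t150\t0.0000000\n"
)

# Write a small gzipped table standing in for a real '*.breadth.gz' output.
path = Path(tempfile.mkdtemp()) / "example.breadth.gz"
with gzip.open(path, "wt") as handle:
    handle.write(content)

rows = []
with gzip.open(path, "rt") as handle:
    for line in handle:
        fields = line.rstrip("\n").split("\t")
        name = fields[3]            # BED 'name' column
        n_reads = int(fields[-4])   # reads overlapping the feature
        fraction = float(fields[-1])  # fraction of feature covered
        rows.append((name, n_reads, fraction))

print(rows)
```

Indexing the appended columns from the end of each row keeps the sketch independent of how many annotation columns the input BED/GFF carried.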

environment.yml

Lines changed: 3 additions & 0 deletions
@@ -47,3 +47,6 @@ dependencies:
   - conda-forge::xopen=0.9.0
   - bioconda::bowtie2=2.4.1
   - bioconda::eigenstratdatabasetools=1.0.2
+  - bioconda::mapdamage2=2.2.0
+  - bioconda::bbmap=38.87
+
