diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8edacdf46..346449488 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -34,13 +34,13 @@ jobs:
- name: Build new docker image
if: env.MATCHED_FILES
- run: docker build --no-cache . -t nfcore/eager:2.2.2
+ run: docker build --no-cache . -t nfcore/eager:2.3
- name: Pull docker image
if: ${{ !env.MATCHED_FILES }}
run: |
docker pull nfcore/eager:dev
- docker tag nfcore/eager:dev nfcore/eager:2.2.2
+ docker tag nfcore/eager:dev nfcore/eager:2.3
- name: Install Nextflow
env:
@@ -146,16 +146,16 @@ jobs:
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_pmdtools
- name: GENOTYPING_UG AND MULTIVCFANALYZER Test running GATK UnifiedGenotyper and MultiVCFAnalyzer, additional VCFs
run: |
- nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer --additional_vcf_files 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/vcf/JK2772_CATCAGTGAGTAGA_L008_R1_001.fastq.gz.tengrand.fq.combined.fq.mapped_rmdup.bam.unifiedgenotyper.vcf.gz' --write_allele_frequencies
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer --additional_vcf_files 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/vcf/JK2772_CATCAGTGAGTAGA_L008_R1_001.fastq.gz.tengrand.fq.combined.fq.mapped_rmdup.bam.unifiedgenotyper.vcf.gz' --write_allele_frequencies
- name: COMPLEX LANE/LIBRARY MERGING Test running lane and library merging prior to GATK UnifiedGenotyper and running MultiVCFAnalyzer
run: |
- nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer
- name: GENOTYPING_UG ON TRIMMED BAM Test
run: |
- nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --run_trim_bam --genotyping_source 'trimmed' --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP'
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --run_trim_bam --genotyping_source 'trimmed' --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP'
- name: BAM_INPUT Run the basic pipeline with the bam input profile, skip AdapterRemoval as no convertBam
run: |
- nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --skip_adapterremoval
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --skip_adapterremoval
- name: BAM_INPUT Run the basic pipeline with the bam input profile, convert to FASTQ for adapterremoval test and downstream
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --run_convertinputbam
@@ -167,6 +167,9 @@ jobs:
- name: METAGENOMIC Run the basic pipeline but with unmapped reads going into MALT
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt/" --malt_sam_output
+ - name: METAGENOMIC Run the basic pipeline but with low-complexity-filtered reads going into MALT
+ run: |
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt/" --metagenomic_complexity_filter
- name: MALTEXTRACT Download resource files
run: |
mkdir -p databases/maltextract
@@ -186,34 +189,6 @@ jobs:
- name: MTNUCRATIO Run basic pipeline with bam input profile, but don't convert BAM, skip everything but mtnucratio
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio
-
- push_dockerhub:
- name: Push new Docker image to Docker Hub
- runs-on: ubuntu-latest
- # Only run if the tests passed
- needs: test
- # Only run for the nf-core repo, for releases and merged PRs
- if: ${{ github.repository == 'nf-core/eager' && (github.event_name == 'release' || github.event_name == 'push') }}
- env:
- DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
- DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASS }}
- steps:
- - name: Check out pipeline code
- uses: actions/checkout@v2
-
- - name: Build new docker image
- run: docker build --no-cache . -t nfcore/eager:latest
-
- - name: Push Docker image to DockerHub (dev)
- if: ${{ github.event_name == 'push' }}
- run: |
- echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
- docker tag nfcore/eager:latest nfcore/eager:dev
- docker push nfcore/eager:dev
- - name: Push Docker image to DockerHub (release)
- if: ${{ github.event_name == 'release' }}
- run: |
- echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
- docker push nfcore/eager:latest
- docker tag nfcore/eager:latest nfcore/eager:${{ github.ref }}
- docker push nfcore/eager:${{ github.ref }}
+ - name: RESCALING Run basic pipeline but with mapDamage rescaling of BAM files. Note this will be slow
+ run: |
+ nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled'
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 39aba1bc7..e30265ed1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,30 @@
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+## [2.3.0] - 2021-01-11 - "Aalen"
+
+### `Added`
+
+- [#640](https://github.com/nf-core/eager/issues/640) - Added pre-metagenomic-screening filtering of low sequence-complexity reads with `bbduk`
+- [#583](https://github.com/nf-core/eager/issues/583) - Added `mapDamage2` rescaling of BAM files to remove damage
+- Updated usage documentation (merging files) and workflow images to reflect the new functionality.
+
+### `Fixed`
+
+- Removed leftover old DockerHub push CI commands.
+- [#627](https://github.com/nf-core/eager/issues/627) - Added de Barros Damgaard citation to README
+- [#630](https://github.com/nf-core/eager/pull/630) - Better handling of Qualimap memory requirements and error strategy.
+- Fixed some incomplete schema options to ensure users supply valid input values
+- [#638](https://github.com/nf-core/eager/issues/638#issuecomment-748877567) - Fixed inverted `circularfilter` filtering (previously filtering happened by default, rather than only when requested by the user as originally documented)
+- [DeDup](https://github.com/apeltzer/DeDup/commit/07d47868f10a6830da8c9161caa3755d9da155bf) - Fixed a null pointer bug in DeDup by updating to version 0.12.8
+- [#650](https://github.com/nf-core/eager/pull/650) - Increased memory given to FastQC for larger files by making it multithreaded
+
+### `Dependencies`
+
+- Update: DeDup v0.12.7 to v0.12.8
+
+### `Deprecated`
+
## [2.2.2] - 2020-12-09
### `Added`
diff --git a/Dockerfile b/Dockerfile
index b9d2d771d..4e8aad0c8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,10 +7,10 @@ COPY environment.yml /
RUN conda env create --quiet -f /environment.yml && conda clean -a
# Add conda installation dir to PATH (instead of doing 'conda activate')
-ENV PATH /opt/conda/envs/nf-core-eager-2.2.2/bin:$PATH
+ENV PATH /opt/conda/envs/nf-core-eager-2.3/bin:$PATH
# Dump the details of the installed packages to a file for posterity
-RUN conda env export --name nf-core-eager-2.2.2 > nf-core-eager-2.2.2.yml
+RUN conda env export --name nf-core-eager-2.3 > nf-core-eager-2.3.yml
# Instruct R processes to use these empty files instead of clashing with a local version
RUN touch .Rprofile
diff --git a/README.md b/README.md
index 0a9675aed..67daebd3d 100644
--- a/README.md
+++ b/README.md
@@ -22,10 +22,42 @@
The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. The pipeline pre-processes raw data from FASTQ inputs, or preprocessed BAM inputs. It can align reads and performs extensive general NGS and aDNA-specific quality control on the results. It comes with docker, singularity or conda containers, making installation trivial and results highly reproducible.
-
-## Pipeline steps
+## Quick Start
+
+1. Install [`nextflow`](https://nf-co.re/usage/installation) (version >= 20.04.0)
+
+2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`Podman`](https://podman.io/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_
+
+3. Download the pipeline and test it on a minimal dataset with a single command:
+
+ ```bash
+ nextflow run nf-core/eager -profile test_tsv,
+ ```
+
+ > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
+
+4. Start running your own analysis!
+
+ ```bash
+ nextflow run nf-core/eager -profile --input '*_R{1,2}.fastq.gz' --fasta '.fasta'
+ ```
+
+5. Once your run has completed successfully, clean up the intermediate files.
+
+ ```bash
+ nextflow clean -f -k
+ ```
+
+See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline.
+
+**N.B.** You can see an overview of the run in the MultiQC report located at `./results/MultiQC/multiqc_report.html`
+
+Modifications to the default pipeline are easily made using various options as described in the documentation.
+
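+For example, a hypothetical command enabling this release's new mapDamage2 rescaling and pre-metagenomic low-complexity filtering might look like the following (the input/reference file names, database path, and entropy value are illustrative only):
+
+```bash
+nextflow run nf-core/eager -profile docker \
+  --input 'samples.tsv' --fasta 'reference.fasta' \
+  --run_mapdamage_rescaling --rescale_length_5p 12 --rescale_length_3p 12 \
+  --run_bam_filtering --bam_unmapped_type 'fastq' \
+  --run_metagenomic_screening --metagenomic_tool 'malt' --database '/path/to/malt/db/' \
+  --metagenomic_complexity_filter --metagenomic_complexity_entropy 0.3
+```
+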
+## Pipeline Summary
### Default Steps
@@ -77,6 +109,7 @@ Additional functionality contained by the pipeline currently includes:
#### Metagenomic Screening
+* Low sequence-complexity filtering (`BBduk`)
* Taxonomic binner with alignment (`MALT`)
* Taxonomic binner without alignment (`Kraken2`)
* aDNA characteristic screening of taxonomically binned data from MALT (`MaltExtract`)
@@ -86,51 +119,9 @@ Additional functionality contained by the pipeline currently includes:
A graphical overview of suggested routes through the pipeline depending on context can be seen below.
-
-## Quick Start
-
-1. Install [`nextflow`](https://nf-co.re/usage/installation) (version >= 20.04.0)
-
-2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`Podman`](https://podman.io/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_
-
-3. Download the pipeline and test it on a minimal dataset with a single command:
-
- ```bash
- nextflow run nf-core/eager -profile test,
- ```
-
- > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
-
-4. Start running your own analysis!
-
- ```bash
- nextflow run nf-core/eager -profile --input '*_R{1,2}.fastq.gz' --fasta '.fasta'
- ```
-
-5. Once your run has completed successfully, clean up the intermediate files.
-
- ```bash
- nextflow clean -f -k
- ```
-
-See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline.
-
-**N.B.** You can see an overview of the run in the MultiQC report located at `./results/MultiQC/multiqc_report.html`
-
-Modifications to the default pipeline are easily made using various options
-as described in the documentation.
-
-## Pipeline Summary
-
-By default, the pipeline currently performs the following:
-
-
-
-* Sequencing quality control (`FastQC`)
-* Overall pipeline run summaries (`MultiQC`)
-
## Documentation
The nf-core/eager pipeline comes with documentation about the pipeline: [usage](https://nf-co.re/eager/usage) and [output](https://nf-co.re/eager/output).
@@ -236,6 +227,8 @@ In addition, references of tools and data used in this pipeline are as follows:
* **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https://dx.doi.org/10.1038/nmeth.1923).
* **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools)
* **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git)
+* **mapDamage2** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics , 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193)
+* **BBduk** Brian Bushnell (Unpublished). Download: [https://sourceforge.net/projects/bbmap/](https://sourceforge.net/projects/bbmap/)
## Data References
@@ -244,3 +237,4 @@ This repository uses test data from the following studies:
* Fellows Yates, J. A. et al. (2017) ‘Central European Woolly Mammoth Population Dynamics: Insights from Late Pleistocene Mitochondrial Genomes’, Scientific reports, 7(1), p. 17714. [doi: 10.1038/s41598-017-17723-1](https://doi.org/10.1038/s41598-017-17723-1).
* Gamba, C. et al. (2014) ‘Genome flux and stasis in a five millennium transect of European prehistory’, Nature communications, 5, p. 5257. [doi: 10.1038/ncomms6257](https://doi.org/10.1038/ncomms6257).
* Star, B. et al. (2017) ‘Ancient DNA reveals the Arctic origin of Viking Age cod from Haithabu, Germany’, Proceedings of the National Academy of Sciences of the United States of America, 114(34), pp. 9152–9157. [doi: 10.1073/pnas.1710186114](https://doi.org/10.1073/pnas.1710186114).
+* de Barros Damgaard, P. et al. (2018). '137 ancient human genomes from across the Eurasian steppes.', Nature, 557(7705), 369–374. [doi: 10.1038/s41586-018-0094-2](https://doi.org/10.1038/s41586-018-0094-2)
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
index 7fc6cabd5..c105fcb4e 100644
--- a/assets/multiqc_config.yaml
+++ b/assets/multiqc_config.yaml
@@ -6,7 +6,6 @@ report_comment: >
This report has been generated by the nf-core/eager
analysis pipeline. For information about how to interpret these results, please see the
documentation.
-
run_modules:
- adapterRemoval
- bowtie2
@@ -270,4 +269,4 @@ report_section_order:
nf-core-eager-summary:
order: -1001
-export_plots: true
+export_plots: true
\ No newline at end of file
diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py
index 201df4a58..5c9c0da9c 100755
--- a/bin/scrape_software_versions.py
+++ b/bin/scrape_software_versions.py
@@ -35,7 +35,9 @@
'VCF2genome':['v_vcf2genome.txt', r"VCF2Genome \(v. ([0-9].[0-9]+) "],
'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"],
'kraken':['v_kraken.txt', r"Kraken version (\S+)"],
- 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"]
+ 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"],
+ 'mapDamage2':['v_mapdamage.txt',r"(\S+)"],
+ 'bbduk':['v_bbduk.txt',r"(\S+)"]
}
results = OrderedDict()
@@ -55,7 +57,7 @@
results['Qualimap'] = 'N/A'
results['Preseq'] = 'N/A'
results['GATK HaplotypeCaller'] = 'N/A'
-#results['GATK UnifiedGenotyper'] = 'N/A'
+results['GATK UnifiedGenotyper'] = 'N/A'
results['freebayes'] = 'N/A'
results['sequenceTools'] = 'N/A'
results['VCF2genome'] = 'N/A'
@@ -71,6 +73,8 @@
results['kraken'] = 'N/A'
results['maltextract'] = 'N/A'
results['eigenstrat_snp_coverage'] = 'N/A'
+results['mapDamage2'] = 'N/A'
+results['bbduk'] = 'N/A'
# Search each file using its regex
for k, v in regexes.items():
diff --git a/conf/base.config b/conf/base.config
index dc58944ba..8266b9c72 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -74,7 +74,7 @@ process {
}
withName:qualimap{
- errorStrategy = 'ignore'
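+ // retry on typical memory/termination exit codes (e.g. OOM kills) rather than silently ignoring Qualimap failures; any other exit status triggers an orderly shutdown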
+ errorStrategy = { task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'finish' }
}
withName:preseq {
diff --git a/conf/test_resources.config b/conf/test_resources.config
index 109e93e4c..74bb4ce2a 100644
--- a/conf/test_resources.config
+++ b/conf/test_resources.config
@@ -51,4 +51,8 @@ process {
time = { check_max( 10.m * task.attempt, 'time' ) }
}
+ withName:'mapdamage_rescaling'{
+ time = { check_max( 20.m * task.attempt, 'time' ) }
+ }
+
}
\ No newline at end of file
diff --git a/docs/images/output/overview/eager2_metromap_complex.png b/docs/images/output/overview/eager2_metromap_complex.png
deleted file mode 100644
index ae866ea96..000000000
Binary files a/docs/images/output/overview/eager2_metromap_complex.png and /dev/null differ
diff --git a/docs/images/output/overview/eager2_workflow.png b/docs/images/output/overview/eager2_workflow.png
deleted file mode 100644
index 10f10fc06..000000000
Binary files a/docs/images/output/overview/eager2_workflow.png and /dev/null differ
diff --git a/docs/images/output/overview/eager2_workflow.svg b/docs/images/output/overview/eager2_workflow.svg
deleted file mode 100644
index 940990523..000000000
--- a/docs/images/output/overview/eager2_workflow.svg
+++ /dev/null
@@ -1,1622 +0,0 @@
-
-
diff --git a/docs/images/usage/eager2_metromap_complex.png b/docs/images/usage/eager2_metromap_complex.png
new file mode 100644
index 000000000..244bd76ad
Binary files /dev/null and b/docs/images/usage/eager2_metromap_complex.png differ
diff --git a/docs/images/output/overview/eager2_metromap_complex.svg b/docs/images/usage/eager2_metromap_complex.svg
similarity index 91%
rename from docs/images/output/overview/eager2_metromap_complex.svg
rename to docs/images/usage/eager2_metromap_complex.svg
index 7e636788f..930a940a9 100644
--- a/docs/images/output/overview/eager2_metromap_complex.svg
+++ b/docs/images/usage/eager2_metromap_complex.svg
@@ -7,14 +7,14 @@
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
- width="297mm"
- height="210mm"
- viewBox="0 0 297 210"
+ width="297.05508mm"
+ height="170.45857mm"
+ viewBox="0 0 297.05508 170.45857"
version="1.1"
id="svg8"
- inkscape:version="1.0.1 (1.0.1+r73)"
- sodipodi:docname="EAGER_lonundmapsvg_complex_fin.svg"
- inkscape:export-filename="/home/jfellows/Documents/graphics/diagrams/EAGER_lonundmapsvg_complex_fin.png"
+ inkscape:version="1.0.1 (1.0.1+r74)"
+ sodipodi:docname="eager2_metromap_complex_v2.svg"
+ inkscape:export-filename="/home/jfellows/Documents/graphics/diagrams/eager2_metromap_complex_v2.png"
inkscape:export-xdpi="300"
inkscape:export-ydpi="300">
+ inkscape:snap-nodes="false"
+ fit-margin-top="0"
+ fit-margin-left="0"
+ fit-margin-right="0"
+ fit-margin-bottom="0">
+ dotted="true"
+ originx="0.32694881"
+ originy="-15.312377" />
@@ -175,21 +181,22 @@
image/svg+xml
-
+
+ id="layer1"
+ transform="translate(0.32694882,-15.312377)">
+ x="-1.7567509"
+ y="17.763466" />
@@ -558,7 +565,7 @@
sodipodi:nodetypes="csssc" />
@@ -654,18 +661,18 @@
EndorSpy
- GATK's UnifiedGenotyperGATK's HaplotypeCallerSequenceTools' PileupCallerFreebayesANGSDPMDToolsBamUtils
+ x="228.58232"
+ y="66.459801"
+ style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:3.175px;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro';text-align:start;text-anchor:start;stroke-width:0.264583"
+ id="tspan1220">PMDToolsBamUtilsmapDamage2's rescaleNuclearContamination(Human)PreSeqMtNucRatio(filtering)samtools' indexKraken ParseMaltExtractMALTKraken(Convert BAM)
FastQCfastpAdapterRemovalFastQC
+ GATK's UnifiedGenotyperGATK's HaplotypeCallersequenceTool's pileupCallerfreebayesANGSD
+
+ BBduk
+
diff --git a/docs/images/usage/eager2_workflow.png b/docs/images/usage/eager2_workflow.png
new file mode 100644
index 000000000..628fbab0a
Binary files /dev/null and b/docs/images/usage/eager2_workflow.png differ
diff --git a/docs/images/usage/eager2_workflow.svg b/docs/images/usage/eager2_workflow.svg
new file mode 100644
index 000000000..b39f52c90
--- /dev/null
+++ b/docs/images/usage/eager2_workflow.svg
@@ -0,0 +1,1807 @@
+
+
diff --git a/docs/images/usage/merging_files.png b/docs/images/usage/merging_files.png
index 55a885b9e..72f1c487a 100644
Binary files a/docs/images/usage/merging_files.png and b/docs/images/usage/merging_files.png differ
diff --git a/docs/images/usage/merging_files.svg b/docs/images/usage/merging_files.svg
index d3f61cc7d..a3c13f2b4 100644
--- a/docs/images/usage/merging_files.svg
+++ b/docs/images/usage/merging_files.svg
@@ -1,4 +1,6 @@
+
+
+> Only different libraries from a single sample that have been BAM trimmed will be merged together. Rescaled or PMD-filtered libraries will not be merged prior to genotyping, as each library _may_ have had a different model applied to it and carry its own biases (i.e. users may need to experiment with settings to optimise damage removal).
+
The use of the TSV `--input` method is recommended when performing more complex procedures such as lane or library merging. You do not need to specify `--single_end`, `--bam`, `--colour_chemistry`, `-udg_type` etc. when using TSV input - this is defined within the TSV file itself. You can only supply a single TSV per run (i.e. `--input '*.tsv'` will not work).
This TSV should look like the following:
@@ -391,203 +393,6 @@ hard drive footprint of the run, so be sure to do this!
## Troubleshooting and FAQs
-### My pipeline update doesn't seem to do anything
-
-To download a new version of a pipeline, you can use the following, replacing
-`` to the corresponding version.
-
-```bash
-nextflow pull nf-core/eager -r
-```
-
-However, in very rare cases, minor fixes to a version will be pushed out without
-a version number bump. This can confuse nextflow slightly, as it thinks you
-already have the 'broken' version from your original pipeline download.
-
-If when running the pipeline you don't see any changes in the fixed version when
-running it, you can try removing your nextflow EAGER cache typically stored in
-your home directory with
-
-```bash
-rm -r ~/.nextflow/assets/nf-core/eager
-```
-
-And re-pull the pipeline with the command above. This will install a fresh
-version of the version with the fixes.
-
-### Input files not found
-
-When using the [direct input](#direct-input-method) method: if no file, only one
-input file, or only 'read one' and not 'read two' is picked up then something is
-likely wrong with your input file declaration ([`--input`](#--input)):
-
-1. The path must be enclosed in quotes (`'` or `"`)
-2. The path must have at least one `*` wildcard character. This is even if you
- are only running one paired end sample.
-3. When using the pipeline with paired end data, the path must use `{1,2}` or
- `{R1,R2}` notation to specify read pairs.
-4. If you are running single-end data make sure to specify `--single_end`
-
-**Important**: The pipeline can't take a list of multiple input files when using
-the direct input method - it takes a 'glob' expression. If your input files are
-scattered in different paths then we recommend that you generate a directory
-with symlinked files. If running in paired-end mode please make sure that your
-files are sensibly named so that they can be properly paired. See the previous
-point.
-
-If the pipeline can't find your files then you will get the following error
-
-```bash
-ERROR ~ Cannot find any reads matching: *{1,2}.fastq.gz
-```
-
-If your sample name is "messy" then you have to be very particular with your
-glob specification. A file name like `L1-1-D-2h_S1_L002_R1_001.fastq.gz` can be
-difficult enough for a human to read. Specifying `*{1,2}*.gz` won't work give
-you what you want whilst `*{R1,R2}*.gz` (i.e. the addition of the `R`s) will.
-
-If using the [TSV input](#tsv-input-method) method, this likely means there is a
-mistake or typo in the path in a given column. Often this is a trailing space at
-the end of the path.
-
-### I am only getting output for a single sample although I specified multiple with wildcards
-
-You must specify paths to files in quotes, otherwise your shell will evaluate
-any wildcards (\*) rather than Nextflow.
-
-For example
-
-```bash
-nextflow run nf-core/eager --input /path/to/sample_*/*.fq.gz
-```
-
-Would be evaluated by your shell as
-
-```bash
-nextflow run nf-core/eager --input /path/to/sample_1/sample_1.fq.gz /path/to/sample_1/sample_1.fq.gz /path/to/sample_1/sample_1.fq.gz
-```
-
-And Nextflow will only take the first path after `--input`, ignoring the others.
-
-On the other hand, encapsulating the path in quotes will allow Nextflow to
-evaluate the paths.
-
-```bash
-nextflow run nf-core/eager --input "/path/to/sample_*/*.fq.gz"
-```
-
-### The pipeline crashes almost immediately with an early pipeline step
-
-Sometimes a newly downloaded and set up nf-core/eager pipeline will encounter an
-issue where a run almost immediately crashes (e.g. at `fastqc`,
-`output_documentation` etc.) saying the tool could not be found or similar.
-
-#### I am running Docker
-
-You may have an outdated container. This happens more often when running on the
-`dev` branch of nf-core/eager, because Docker will _not_ update the container on
-each new commit, and thus may not get new tools called within the pipeline code.
-
-To fix, just re-pull the nf-core/eager Docker container manually with:
-
-```bash
-docker pull nfcore/eager:dev
-```
-
-#### I am running Singularity
-
-If you're running Singularity, it could be that Nextflow cannot access your
-Singularity image properly - often due to missing bind paths.
-
-See
-[here](https://nf-co.re/usage/troubleshooting#cannot-find-input-files-when-using-singularity)
-for more information.
-
-### The pipeline has crashed with an error but Nextflow is still running
-
-If this happens, you can either wait until all other already running jobs to
-safely finish, or if Nextflow _still_ does not stop press `ctrl + c` on your
-keyboard (or equivalent) to stop the Nextflow run.
-
-> :warning: if you do this, and do not plan to fix the run make sure to delete
-the output folder. Otherwise you may end up a lot of large intermediate files
-being left! You can clean a Nextflow run of all intermediate files with
-`nextflow clean -f -k` or delete the `work/` directory.
-
-### I get a exceeded job memory limit error
-
-While Nextflow tries to make your life easier by automatically retrying jobs
-that run out of memory with more resources (until your specified max-limit),
-sometimes you may have such large data you run out even after the default 3
-retries.
-
-To fix this you need to change the default memory requirements for the process
-that is breaking. We can do this by making a custom profile, which we then
-provide to the Nextflow run command.
-
-For example, lets say it's the `markduplicates` process that is running out of
-memory.
-
-First we need to check to see what default memory value we have. We can do this
-by going to the main [nf-core/eager code](https://github.com/nf-core/) and
-opening the `main.nf` file. We can then use your browser's find functionality
-for: `process markduplicates`.
-
-Once found, we then need to check the line called `label`. In this case the
-label is `mc_small` (for multi-core small).
-
-Next we need to go back to the main github repository, and open
-`conf/base.config`. Again using our find functionality, we search for:
-`withLabel:'mc_small'`.
-
-We see that the `memory` is set to `4.GB` (`memory = { check_max( 4.GB *
-task.attempt, 'memory' )})`).
-
-Now back on your computer, we need to make a new file called
-`custom_resources.conf`. You should save it somewhere centrally so you can
-reuse it.
-
-> If you think this would be useful for multiple people in your lab/institute,
-> we highly recommend you make an institutional profile at
-> [nf-core/configs](https://github.com/nf-core/configs). This will simplify this
-> process in the future.
-
-Within this file, you will need to add the following:
-
-```txt
-profiles {
- big_data {
- process {
- withName: markduplicates {
- memory = 16.GB
- }
- }
- }
-}
-```
-
-Where we have increased the default `4.GB` to `16.GB`. Make sure that you keep
-the `check_max` function, as this prevents your run asking for too much memory
-during retries.
-
-> Note that with this you will _not_ have the automatic retry mechanism. If
-> you want this, re-add the `check_max()` function on the `memory` line, and
-> add to the bottom of the entire file (outside the profiles block), the
-> block starting `def check_max(obj, type) {`, which is at the end of the
-> [nextflow.config file](https://github.com/nf-core/eager/blob/master/nextflow.config)
-
-Once saved, we can then modify your original Nextflow run command:
-
-```bash
-nextflow run nf-core/eager -r 2.2.0 -c ///custom_resources.conf -profile big_data,, <...>
-```
-
-Where we have added `-c` to specify which file to use for the custom profiles,
-and then added the `big_data` profile to the original profiles you were using.
-
-:warning: it's important that big_data comes first, to ensure it overwrites any
-parameters set in the subsequent profiles!
-
### I get a file name collision error during merging
When using TSV input, nf-core/eager will attempt to merge all `Lanes` of a
@@ -608,37 +413,6 @@ they are unique (e.g. if one library was sequenced on Lane 8 of two HiSeq runs,
specify lanes as 8 and 16 for each FASTQ file respectively). For library merging
errors, you must modify your `Library_ID`s accordingly, to make them unique.
-### I specified a module and it didn't produce the expected output
-
-Possible options:
-
-1. Check there if you have a typo in the parameter name. Nextflow _does not_
- check for this
-2. Check that an upstream module was turned on (if a module requires the output
- of a previous module, it will not be activated unless it receives the output)
-
-### I get a unable to acquire lock
-
-Errors like the following
-
-```bash
-Unable to acquire lock on session with ID 84333844-66e3-4846-a664-b446d070f775
-```
-
-normally suggest a previous Nextflow run (on the same folder) was not cleanly
-killed by a user (e.g. using ctrl + z to hard kill a crashed run).
-
-To fix this, you must clean the entirety of the output directory (including
-output files) e.g. with `rm -r /* /.*` and re-running
-from scratch.
-
-`ctrl +z` is **not** a recommended way of killing a Nextflow job. Runs that take
-a long time to fail are often still running because other job submissions are
-still running. Nextflow will normally wait for those processes to complete
-before cleaning shutting down the run (to allow rerunning of a run with
-`-resume`). `ctrl + c` is much safer as it will tell Nextflow to stop earlier
-but cleanly.
-
## Tutorials
### Tutorial - How to investigate a failed run
diff --git a/environment.yml b/environment.yml
index e0e05c545..afa4cf6f7 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,6 +1,6 @@
# You can use this file to create a conda environment for this pipeline:
# conda env create -f environment.yml
-name: nf-core-eager-2.2.2
+name: nf-core-eager-2.3
channels:
- conda-forge
- bioconda
@@ -18,7 +18,7 @@ dependencies:
- bioconda::bwa=0.7.17
- bioconda::picard=2.22.9
- bioconda::samtools=1.9
- - bioconda::dedup=0.12.7
+ - bioconda::dedup=0.12.8
- bioconda::angsd=0.933
- bioconda::circularmapper=1.93.5
- bioconda::gatk4=4.1.7.0
@@ -47,3 +47,6 @@ dependencies:
- conda-forge::xopen=0.9.0
- bioconda::bowtie2=2.4.1
- bioconda::eigenstratdatabasetools=1.0.2
+ - bioconda::mapdamage2=2.2.0
+ - bioconda::bbmap=38.87
+
diff --git a/main.nf b/main.nf
index ee9883454..3eaac7434 100644
--- a/main.nf
+++ b/main.nf
@@ -87,7 +87,7 @@ def helpMessage() {
--bwaalnl [num] Specify the -l parameter for BWA aln, i.e. length of seeds to be used. Set to 1024 for whole read. Default: ${params.bwaalnl}
--circularextension [num] Specify the number of bases to extend reference by (circularmapper only). Default: ${params.circularextension}
--circulartarget [chr] Specify the FASTA header of the target chromosome to extend(circularmapper only). Default: '${params.circulartarget}'
- --circularfilter [bool] Turn on to filter off-target reads (circularmapper only).
+ --circularfilter [bool] Turn on to remove reads that did not map to the circularised genome (circularmapper only).
--bt2_alignmode [str] Specify the bowtie2 alignment mode. Options: 'local', 'end-to-end'. Default: '${params.bt2_alignmode}'
--bt2_sensitivity [str] Specify the level of sensitivity for the bowtie2 alignment mode. Options: 'no-preset', 'very-fast', 'fast', 'sensitive', 'very-sensitive'. Default: '${params.bt2_sensitivity}'
--bt2n [num] Specify the -N parameter for bowtie2 (mismatches in seed). This will override defaults from alignmode/sensitivity. Default: ${params.bt2n}
@@ -116,12 +116,15 @@ def helpMessage() {
--damageprofiler_length [num] Specify length filter for DamageProfiler. Default: ${params.damageprofiler_length}
--damageprofiler_threshold [num] Specify number of bases of each read to consider for DamageProfiler calculations. Default: ${params.damageprofiler_threshold}
--damageprofiler_yaxis [float] Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'. Default: ${params.damageprofiler_yaxis}
+ --run_mapdamage_rescaling Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.
+ --rescale_length_5p Length of read for mapDamage2 to rescale from 5p end. Default: ${params.rescale_length_5p}
+ --rescale_length_3p Length of read for mapDamage2 to rescale from 3p end. Default: ${params.rescale_length_3p}
--run_pmdtools [bool] Turn on PMDtools
--pmdtools_range [num] Specify range of bases for PMDTools. Default: ${params.pmdtools_range}
--pmdtools_threshold [num] Specify PMDScore threshold for PMDTools. Default: ${params.pmdtools_threshold}
--pmdtools_reference_mask [file] Specify a path to reference mask for PMDTools.
--pmdtools_max_reads [num] Specify the maximum number of reads to consider for metrics generation. Default: ${params.pmdtools_max_reads}
-
+
Annotation Statistics
--run_bedtools_coverage [bool] Turn on ability to calculate no. reads, depth and breadth coverage of features in reference.
--anno_file [file] Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes.
@@ -137,7 +140,7 @@ def helpMessage() {
Genotyping
--run_genotyping [bool] Turn on genotyping of BAM files.
--genotyping_tool [str] Specify which genotyper to use either GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'.
- --genotyping_source [str] Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed' or 'pmd'. Default: '${params.genotyping_source}'
+ --genotyping_source [str] Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd', 'rescaled'. Default: '${params.genotyping_source}'
--gatk_call_conf [num] Specify GATK phred-scaled confidence threshold. Default: ${params.gatk_call_conf}
--gatk_ploidy [num] Specify GATK organism ploidy. Default: ${params.gatk_ploidy}
--gatk_downsample [num] Maximum depth coverage allowed for genotyping before down-sampling is turned on. Default: ${params.gatk_downsample}
@@ -193,19 +196,21 @@ def helpMessage() {
--contamination_chrom_name [str] The name of the X chromosome in your bam or FASTA header. 'X' for hs37d5, 'chrX' for HG19. Default: '${params.contamination_chrom_name}'
Metagenomic Screening
- --run_metagenomic_screening [bool] Turn on metagenomic screening module for reference-unmapped reads
- --metagenomic_tool [str] Specify which classifier to use. Options: 'malt', 'kraken'. Default: '${params.contamination_chrom_name}'
- --database [dir] Specify path to classifer database directory. For Kraken2 this can also be a `.tar.gz` of the directory.
- --metagenomic_min_support_reads [num] Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with . Default: ${params.metagenomic_min_support_reads}
- --percent_identity [num] Percent identity value threshold for MALT. Default: ${params.percent_identity}
- --malt_mode [str] Specify which alignment method to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'. Default: '${params.malt_mode}'
- --malt_alignment_mode [str] Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'. Default: '${params.malt_alignment_mode}'
- --malt_top_percent [num] Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual). Default: ${params.malt_top_percent}
- --malt_min_support_mode [str] Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'. Default: '${params.malt_min_support_mode}'
- --malt_min_support_percent [num] Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT. Default: Default: ${params.malt_min_support_percent}
- --malt_max_queries [num] Specify the maximium number of queries a read can have for MALT. Default: ${params.malt_max_queries}
- --malt_memory_mode [str] Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'. Default: '${params.malt_memory_mode}'
- --malt_sam_output [bool] Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes.
+ --metagenomic_complexity_filter Turn on removal of low sequence-complexity reads prior to metagenomic screening with bbduk.
+ --metagenomic_complexity_entropy Specify the entropy threshold below which a sequencing read will be filtered out. This should be between 0 and 1. Default: '${params.metagenomic_complexity_entropy}'
+ --run_metagenomic_screening [bool] Turn on metagenomic screening module for reference-unmapped reads
+ --metagenomic_tool [str] Specify which classifier to use. Options: 'malt', 'kraken'. Default: '${params.metagenomic_tool}'
+ --database [dir] Specify path to classifier database directory. For Kraken2 this can also be a `.tar.gz` of the directory.
+ --metagenomic_min_support_reads [num] Specify a minimum number of reads (of the sample total) a taxon must have to be retained. Not compatible with . Default: ${params.metagenomic_min_support_reads}
+ --percent_identity [num] Percent identity value threshold for MALT. Default: ${params.percent_identity}
+ --malt_mode [str] Specify which alignment method to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'. Default: '${params.malt_mode}'
+ --malt_alignment_mode [str] Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'. Default: '${params.malt_alignment_mode}'
+ --malt_top_percent [num] Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual). Default: ${params.malt_top_percent}
+ --malt_min_support_mode [str] Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'. Default: '${params.malt_min_support_mode}'
+ --malt_min_support_percent [num] Specify the minimum percentage of reads (of the sample total) a taxon must have to be retained for MALT. Default: ${params.malt_min_support_percent}
+ --malt_max_queries [num] Specify the maximum number of queries a read can have for MALT. Default: ${params.malt_max_queries}
+ --malt_memory_mode [str] Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'. Default: '${params.malt_memory_mode}'
+ --malt_sam_output [bool] Specify to also produce SAM alignment files. Note these include both aligned and unaligned reads, are gzipped, and will result in very large file sizes.
Metagenomic Authentication
--run_maltextract [bool] Turn on MaltExtract for MALT aDNA characteristics authentication
@@ -285,7 +290,7 @@ if("${params.fasta}".endsWith(".gz")){
path zipped_fasta from file(params.fasta) // path doesn't like it if a string of an object is not prefaced with a root dir (/), so use file() to resolve string before parsing to `path`
output:
- path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd
+ path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling
script:
unzip = zipped_fasta.toString() - '.gz'
@@ -296,7 +301,7 @@ if("${params.fasta}".endsWith(".gz")){
} else {
fasta_for_indexing = Channel
.fromPath("${params.fasta}", checkIfExists: true)
- .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd }
+ .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling }
}
// Check that fasta index file path ends in '.fai'
@@ -413,8 +418,13 @@ if (params.sexdeterrmine_bedfile == '') {
// Genotyping validation
if (params.run_genotyping){
+
+ if (params.genotyping_source != 'raw' && params.genotyping_source != 'pmd' && params.genotyping_source != 'trimmed' && params.genotyping_source != 'rescaled' ) {
+ exit 1, "[nf-core/eager] error: please specify a valid genotyping source. Options: 'raw', 'pmd', 'trimmed', 'rescaled'. Found parameter: --genotyping_source '${params.genotyping_source}'."
+ }
+
if (params.genotyping_tool != 'ug' && params.genotyping_tool != 'hc' && params.genotyping_tool != 'freebayes' && params.genotyping_tool != 'pileupcaller' && params.genotyping_tool != 'angsd' ) {
- exit 1, "[nf-core/eager] error: please specify a genotyper. Options: 'ug', 'hc', 'freebayes', 'pileupcaller'. Found parameter: --genotyping_tool '${params.genotyping_tool}'."
+ exit 1, "[nf-core/eager] error: please specify a valid genotyper. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'. Found parameter: --genotyping_tool '${params.genotyping_tool}'."
}
if (params.gatk_ug_out_mode != 'EMIT_VARIANTS_ONLY' && params.gatk_ug_out_mode != 'EMIT_ALL_CONFIDENT_SITES' && params.gatk_ug_out_mode != 'EMIT_ALL_SITES') {
@@ -506,6 +516,7 @@ if (params.run_multivcfanalyzer) {
}
// Metagenomic validation
+
if (params.run_metagenomic_screening) {
if ( params.bam_unmapped_type == "discard" ) {
exit 1, "[nf-core/eager] error: metagenomic classification can only run on unmapped reads. Please supply --bam_unmapped_type 'fastq'. Supplied: --bam_unmapped_type '${params.bam_unmapped_type}'."
@@ -808,6 +819,8 @@ Channel.from(summary.collect{ [it.key, it.value] })
""".stripIndent() }
.set { ch_workflow_summary }
+log.info "Schaffa, Schaffa, Genome Baua!"
+
///////////////////////////////////////////////////
/* -- REFERENCE FASTA INDEXING -- */
///////////////////////////////////////////////////
@@ -1008,7 +1021,7 @@ process indexinputbam {
// Raw sequencing QC - allow user evaluate if sequencing any good?
process fastqc {
- label 'sc_small'
+ label 'mc_small'
tag "${libraryid}_L${lane}"
publishDir "${params.outdir}/fastqc/input_fastq", mode: params.publish_dir_mode,
saveAs: { filename ->
@@ -1079,7 +1092,7 @@ process fastp {
"""
} else {
"""
- fastp --in1 ${r1} --in2 ${r2} --out1 "${r1.baseName}.pG.fq.gz" --out2 "${r2.baseName}.pG.fq.gz" -A -g --poly_g_min_len "${params.complexity_filter_poly_g_min}" -Q -L -w ${task.cpus} --json "${libraryid}"_L${lane}_fastp.json
+ fastp --in1 ${r1} --in2 ${r2} --out1 "${r1.baseName}.pG.fq.gz" --out2 "${r2.baseName}.pG.fq.gz" -A -g --poly_g_min_len "${params.complexity_filter_poly_g_min}" -Q -L -w ${task.cpus} --json "${libraryid}"_L${lane}_polyg_fastp.json
"""
}
}
@@ -1422,7 +1435,7 @@ process lanemerge_hostremoval_fastq {
// Post-preprocessing QC to help user check pre-processing removed all sequencing artefacts
process fastqc_after_clipping {
- label 'sc_small'
+ label 'mc_small'
tag "${libraryid}_L${lane}"
publishDir "${params.outdir}/fastqc/after_clipping", mode: params.publish_dir_mode,
saveAs: { filename ->
@@ -1541,6 +1554,7 @@ process circulargenerator{
else null
}
+
input:
file fasta from ch_fasta_for_circulargenerator
@@ -1578,7 +1592,7 @@ process circularmapper{
params.mapper == 'circularmapper'
script:
- def filter = params.circularfilter ? '' : '-f true -x false'
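+ // apply off-target read removal flags only when the user requests --circularfilter (behaviour was previously inverted, see #638)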
+ def filter = params.circularfilter ? '-f true -x true' : ''
def elongated_root = "${fasta.baseName}_${params.circularextension}.fasta"
def size = params.large_ref ? '-c' : ''
@@ -1853,7 +1867,7 @@ process samtools_filter {
output:
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*filtered.bam"), file("*.{bai,csi}") into ch_output_from_filtering
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.fastq.gz") optional true into ch_bam_filtering_for_metagenomic
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.fastq.gz") optional true into ch_bam_filtering_for_metagenomic,ch_metagenomic_for_skipentropyfilter
tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.bam") optional true
// Using shell block rather than script because we are playing with awk
@@ -2172,11 +2186,11 @@ process library_merge {
if (!params.skip_deduplication) {
ch_input_for_skiplibrarymerging.mix(ch_output_from_librarymerging)
.filter { it =~/.*_rmdup.bam/ }
- .into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools }
+ .into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools; ch_rmdup_for_damagerescaling }
} else {
ch_input_for_skiplibrarymerging.mix(ch_output_from_librarymerging)
- .into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools }
+ .into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools; ch_rmdup_for_damagerescaling }
}
//////////////////////////////////////////////////
@@ -2282,7 +2296,37 @@ process damageprofiler {
"""
}
-// Optionally perform further aDNA evaluation or filtering for just reads with damage etc.
+// Damage rescaling with mapDamage
+
+process mapdamage_rescaling {
+
+ label 'sc_small'
+ tag "${libraryid}"
+
+ publishDir "${params.outdir}/damage_rescaling", mode: params.publish_dir_mode
+
+ when:
+ params.run_mapdamage_rescaling
+
+ input:
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_damagerescaling
+ file fasta from ch_fasta_for_damagerescaling.collect()
+
+ output:
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_rescaled.bam"), path("*rescaled.bam.{bai,csi}") into ch_output_from_damagerescaling
+
+ script:
+ def base = "${bam.baseName}"
+ def singlestranded = strandedness == "single" ? '--single-stranded' : ''
+ def size = params.large_ref ? '-c' : ''
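+ // single-stranded libraries use mapDamage's --single-stranded damage model; '-c' requests a CSI index for large references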
+ """
+ mapDamage -i ${bam} -r ${fasta} --rescale --rescale-out ${base}_rescaled.bam --rescale-length-5p ${params.rescale_length_5p} --rescale-length-3p ${params.rescale_length_3p} ${singlestranded}
+ samtools index ${size} ${base}_rescaled.bam
+ """
+
+}
+
+// Optionally perform further aDNA evaluation or filtering for just reads with damage etc.
process pmdtools {
label 'mc_small'
@@ -2369,7 +2413,7 @@ process bam_trim {
"""
}
-// Post trimming merging of libraries to single samples, except for SS/DS
+// Post-trimming merging of libraries to single samples, except for SS/DS
// libraries as they should be genotyped separately, because we will assume
// that if trimming is turned on, 'lab-removed' libraries can be combined with
// merged with 'in-silico damage removed' libraries to improve genotyping
@@ -2443,7 +2487,7 @@ process qualimap {
script:
def snpcap = params.snpcapture_bed != '' ? "-gff ${params.snpcapture_bed}" : ''
"""
- qualimap bamqc -bam $bam -nt ${task.cpus} -outdir . -outformat "HTML" ${snpcap}
+ qualimap bamqc -bam $bam -nt ${task.cpus} -outdir . -outformat "HTML" ${snpcap} --java-mem-size=${task.memory.toGiga()}G
"""
}
@@ -2464,12 +2508,19 @@ if ( params.run_genotyping && params.genotyping_source == 'raw' ) {
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
} else if ( params.run_genotyping && params.genotyping_source == "pmd" && !params.run_pmdtools ) {
- exit 1, "[nf-core/eager] error: Cannot run genotyping with 'pmd' source without running pmtools (--run_pmdtools)! Please check input parameters."
+ exit 1, "[nf-core/eager] error: Cannot run genotyping with 'pmd' source without running pmdtools (--run_pmdtools)! Please check input parameters."
} else if ( params.run_genotyping && params.genotyping_source == "pmd" && params.run_pmdtools ) {
ch_output_from_pmdtools
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
+} else if ( params.run_genotyping && params.genotyping_source == "rescaled" && params.run_mapdamage_rescaling) {
+ ch_output_from_damagerescaling
+ .into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
+
+} else if ( params.run_genotyping && params.genotyping_source == "rescaled" && !params.run_mapdamage_rescaling) {
+ exit 1, "[nf-core/eager] error: Cannot run genotyping with 'rescaled' source without running damage rescaling (--run_mapdamage_rescaling)! Please check input parameters."
+
} else if ( !params.run_genotyping && !params.run_trim_bam && !params.run_pmdtools ) {
ch_rmdup_for_skipdamagemanipulation
.into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd }
@@ -2640,36 +2691,36 @@ process genotyping_pileupcaller {
"""
samtools mpileup -B -q 30 -Q 30 ${use_bed} -f ${fasta} ${bam_list} | pileupCaller ${caller} ${ssmode} ${transitions_mode} --sampleNames ${sample_names} ${use_snp} -e pileupcaller.${strandedness}
"""
- }
-
+}
+
process eigenstrat_snp_coverage {
- label 'mc_tiny'
- tag "${strandedness}"
- publishDir "${params.outdir}/genotyping", mode: params.publish_dir_mode
-
- when:
- params.run_genotyping && params.genotyping_tool == 'pileupcaller'
-
- input:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump()
-
- output:
- tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc
- path("*_eigenstrat_coverage.txt")
-
- script:
- /*
- The following code block can be swapped in once the eigenstratdatabasetools MultiQC module becomes available.
- """
- eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json
- """
- */
- """
- eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt
- parse_snp_cov.py ${strandedness}_eigenstrat_coverage.txt
- """
- }
-
+ label 'mc_tiny'
+ tag "${strandedness}"
+ publishDir "${params.outdir}/genotyping", mode: params.publish_dir_mode
+
+ when:
+ params.run_genotyping && params.genotyping_tool == 'pileupcaller'
+
+ input:
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump(tag:'eigenstrat_input')
+
+ output:
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc
+ path("*_eigenstrat_coverage.txt")
+
+ script:
+ /*
+ The following code block can be swapped in once the eigenstratdatabasetools MultiQC module becomes available.
+ """
+ eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json
+ """
+ */
+ """
+ eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt
+ parse_snp_cov.py ${strandedness}_eigenstrat_coverage.txt
+ """
+}
+
process genotyping_angsd {
label 'mc_small'
tag "${samplename}"
@@ -2897,22 +2948,57 @@ process print_nuclear_contamination{
/* -- METAGENOMICS-SPECIFIC ADDITIONAL STEPS -- */
/////////////////////////////////////////////////////////
+// Low-entropy read filter to remove highly uninformative reads prior to metagenomic screening, reducing runtime and false positives
+
+process metagenomic_complexity_filter {
+ label 'mc_small'
+ tag "${samplename}"
+ publishDir "${params.outdir}/metagenomic_complexity_filter/", mode: params.publish_dir_mode
+
+ when:
+ params.metagenomic_complexity_filter
+
+ input:
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(fastq) from ch_bam_filtering_for_metagenomic
+
+
+ output:
+ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_lowcomplexityremoved.fq.gz") into ch_lowcomplexityfiltered_for_metagenomic
+ path("*_bbduk.stats") into ch_metagenomic_complexity_filter_for_multiqc
+
+ script:
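+ // bbduk discards reads whose average entropy falls below the threshold; its stats report is written to stderr and collected for MultiQC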
+ """
+ bbduk.sh -Xmx${task.memory.toGiga()}g in=${fastq} threads=${task.cpus} entropymask=f entropy=${params.metagenomic_complexity_entropy} out=${fastq}_lowcomplexityremoved.fq.gz 2> ${fastq}_bbduk.stats
+ """
+
+}
+
+// metagenomic complexity filter bypass
+
+if ( params.metagenomic_complexity_filter ) {
+ ch_lowcomplexityfiltered_for_metagenomic
+ .set{ ch_filtered_for_metagenomic }
+} else {
+ ch_metagenomic_for_skipentropyfilter
+ .set{ ch_filtered_for_metagenomic }
+}
+
// MALT is a super-fast BLAST replacement typically used for pathogen detection or microbiome profiling against large databases, here using off-target reads from mapping
// As we collect all files for all metagenomic runs, we DO NOT use the normal input/output tuple!
if (params.metagenomic_tool == 'malt') {
- ch_bam_filtering_for_metagenomic
- .set {ch_bam_filtering_for_metagenomic_malt}
+ ch_filtered_for_metagenomic
+ .set {ch_input_for_metagenomic_malt}
- ch_bam_filtering_for_metagenomic_kraken = Channel.empty()
+ ch_input_for_metagenomic_kraken = Channel.empty()
} else if (params.metagenomic_tool == 'kraken') {
- ch_bam_filtering_for_metagenomic
- .set {ch_bam_filtering_for_metagenomic_kraken}
+ ch_filtered_for_metagenomic
+ .set {ch_input_for_metagenomic_kraken}
- ch_bam_filtering_for_metagenomic_malt = Channel.empty()
+ ch_input_for_metagenomic_malt = Channel.empty()
} else if ( params.metagenomic_tool == '' ) {
- ch_bam_filtering_for_metagenomic_malt = Channel.empty()
- ch_bam_filtering_for_metagenomic_kraken = Channel.empty()
+ ch_input_for_metagenomic_malt = Channel.empty()
+ ch_input_for_metagenomic_kraken = Channel.empty()
}
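+
+// Illustrative invocation wiring the new filter into a MALT run (database
+// path and profile are placeholders; all parameters appear in
+// nextflow_schema.json below):
+//
+//   nextflow run nf-core/eager -profile docker --run_bam_filtering \
+//     --bam_unmapped_type 'fastq' --run_metagenomic_screening \
+//     --metagenomic_tool 'malt' --database '<path/to/malt/db>' \
+//     --metagenomic_complexity_filter --metagenomic_complexity_entropy 0.3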
@@ -2925,7 +3011,7 @@ process malt {
params.run_metagenomic_screening && params.run_bam_filtering && params.bam_unmapped_type == 'fastq' && params.metagenomic_tool == 'malt'
input:
- file fastqs from ch_bam_filtering_for_metagenomic_malt.map { it[7] }.collect()
+ file fastqs from ch_input_for_metagenomic_malt.map { it[7] }.collect()
file db from ch_db_for_malt
output:
@@ -3043,7 +3129,7 @@ process kraken {
params.run_metagenomic_screening && params.run_bam_filtering && params.bam_unmapped_type == 'fastq' && params.metagenomic_tool == 'kraken'
input:
- path(fastq) from ch_bam_filtering_for_metagenomic_kraken.map { it[7] }
+ path(fastq) from ch_input_for_metagenomic_kraken.map { it[7] }
path(krakendb) from ch_krakendb
output:
@@ -3165,6 +3251,8 @@ process get_software_versions {
pileupCaller --version &> v_sequencetools.txt 2>&1 || true
bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true
eigenstrat_snp_coverage --version | cut -d ' ' -f2 >v_eigenstrat_snp_coverage.txt || true
+    mapDamage --version > v_mapdamage.txt || true
+    bbduk.sh 2>&1 | grep 'Last modified' | cut -d' ' -f 3-99 > v_bbduk.txt || true
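+    # Note: bbduk prints its usage header (whose 'Last modified' date stands in
+    # for a version string) to stderr, hence the 2>&1 before the pipe above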
scrape_software_versions.py &> software_versions_mqc.yaml
"""
@@ -3198,6 +3286,7 @@ process multiqc {
file ('mutnucratio/*') from ch_mtnucratio_for_multiqc.collect().ifEmpty([])
file ('endorspy/*') from ch_endorspy_for_multiqc.collect().ifEmpty([])
file ('multivcfanalyzer/*') from ch_multivcfanalyzer_for_multiqc.collect().ifEmpty([])
+    file ('bbduk/*') from ch_metagenomic_complexity_filter_for_multiqc.collect().ifEmpty([])
file ('malt/*') from ch_malt_for_multiqc.collect().ifEmpty([])
file ('kraken/*') from ch_kraken_for_multiqc.collect().ifEmpty([])
file ('hops/*') from ch_hops_for_multiqc.collect().ifEmpty([])
@@ -3391,7 +3480,7 @@ def checkHostname() {
def extract_data(tsvFile) {
Channel.fromPath(tsvFile)
.splitCsv(header: true, sep: '\t')
- .dump()
+ .dump(tag:'tsv_extract')
.map { row ->
def expected_keys = ['Sample_Name', 'Library_ID', 'Lane', 'Colour_Chemistry', 'SeqType', 'Organism', 'Strandedness', 'UDG_Treatment', 'R1', 'R2', 'BAM']
diff --git a/nextflow.config b/nextflow.config
index c9fcecfe9..8be4e8ad7 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -110,6 +110,10 @@ params {
pmdtools_reference_mask = ''
pmdtools_max_reads = 10000
+ // mapDamage
+ run_mapdamage_rescaling = false
+  rescale_length_5p = 12
+  rescale_length_3p = 12
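+  // Illustrative CLI override: --run_mapdamage_rescaling --rescale_length_5p 10 --rescale_length_3p 10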
//Bedtools settings
run_bedtools_coverage = false
@@ -185,8 +189,12 @@ params {
run_nuclear_contamination = false
contamination_chrom_name = 'X' // Default to using hs37d5 name
- // taxonomic classifer
+ // taxonomic classifier
run_metagenomic_screening = false
+
+ metagenomic_complexity_filter = false
+ metagenomic_complexity_entropy = 0.3
+
metagenomic_tool = ''
database = ''
metagenomic_min_support_reads = 1
@@ -243,7 +251,7 @@ params {
// Container slug. Stable releases should specify release tag!
// Developmental code should specify :dev
-process.container = 'nfcore/eager:2.2.2'
+process.container = 'nfcore/eager:2.3'
// Load base.config by default for all pipelines
includeConfig 'conf/base.config'
@@ -337,7 +345,7 @@ manifest {
description = 'A fully reproducible and state-of-the-art ancient DNA analysis pipeline'
mainScript = 'main.nf'
nextflowVersion = '!>=20.04.0'
- version = '2.2.2'
+ version = '2.3'
}
// Function to ensure that resource requirements don't go beyond
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 6c14dc9c9..bc51825a0 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -555,9 +555,9 @@
},
"circularfilter": {
"type": "boolean",
- "description": "Turn on to filter off-target reads (circularmapper only).",
+ "description": "Turn on to remove reads that did not map to the circularised genome (circularmapper only).",
"fa_icon": "fas fa-filter",
- "help_text": "If you want to filter out reads that don't map to a circular chromosome, turn this on. By default this option is turned off.\n"
+ "help_text": "If you want to filter out reads that don't map to a circular chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on. By default this option is turned off.\n> Modifies -f and -x parameters of CircularMapper's realignsamfile\n"
},
"bt2_alignmode": {
"type": "string",
@@ -790,6 +790,26 @@
"description": "Specify the maximum number of reads to consider for metrics generation.",
"fa_icon": "fas fa-greater-than-equal",
"help_text": "The maximum number of reads used for damage assessment in PMDtools. Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. Note that a too low value can also obtain incorrect results.\n\n> Modifies PMDTools parameter: `-n`"
+ },
+ "run_mapdamage_rescaling": {
+ "type": "boolean",
+ "fa_icon": "fas fa-map",
+ "description": "Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.",
+        "help_text": "Turns on mapDamage2's BAM rescaling functionality. This probabilistically replaces Ts with Cs depending on the likelihood that the reference mismatch was originally caused by damage. If the library is specified to be single-stranded, this will automatically use the `--single-stranded` mode.\n\nThis functionality does not have any MultiQC output.\n\n:warning: rescaled libraries will not be merged with non-rescaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input.\n\n> Modifies the `--rescale` parameter of mapDamage2"
+ },
+ "rescale_length_5p": {
+ "type": "integer",
+ "default": 12,
+ "fa_icon": "fas fa-balance-scale-right",
+        "description": "Length of read (in bp) for mapDamage2 to rescale from the 5' end.",
+        "help_text": "Specify the length from the 5' end of the read that mapDamage2 should rescale.\n\n> Modifies the `--rescale-length-5p` parameter of mapDamage2."
+ },
+ "rescale_length_3p": {
+ "type": "integer",
+ "default": 12,
+ "fa_icon": "fas fa-balance-scale-left",
+        "description": "Length of read (in bp) for mapDamage2 to rescale from the 3' end.",
+        "help_text": "Specify the length from the 3' end of the read that mapDamage2 should rescale.\n\n> Modifies the `--rescale-length-3p` parameter of mapDamage2."
}
},
"fa_icon": "fas fa-chart-line",
@@ -895,9 +915,15 @@
"genotyping_source": {
"type": "string",
"default": "raw",
- "description": "Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed' or 'pmd'.",
+ "description": "Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd' or 'rescaled'.",
"fa_icon": "fas fa-faucet",
- "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: `'raw'` for mapped only, filtered, or DeDup BAMs (with priority right to left); `'trimmed'` (for base clipped BAMs); `'pmd'` (for pmdtools output). Default is: `'raw'`.\n"
+ "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: `'raw'` for mapped only, filtered, or DeDup BAMs (with priority right to left); `'trimmed'` (for base clipped BAMs); `'pmd'` (for pmdtools output); `'rescaled'` (for mapDamage2 rescaling output). Default is: `'raw'`.\n",
+ "enum": [
+ "raw",
+ "pmd",
+ "trimmed",
+ "rescaled"
+ ]
},
"gatk_call_conf": {
"type": "integer",
@@ -1291,6 +1317,21 @@
"description": "Options for metagenomic screening of off-target reads.",
"default": "",
"properties": {
+ "metagenomic_complexity_filter": {
+ "type": "boolean",
+        "description": "Turn on removal of low-complexity reads from the input to metagenomic screening, using bbduk.",
+        "help_text": "Turns on low-complexity filtering of off-target reads using `bbduk`.\n\nThis is typically performed to remove uninformative reads, reducing false-positive species identifications in metagenomic screening as well as run time and resource requirements.\n\nSee `--metagenomic_complexity_entropy` for how complexity is calculated. **Important** There are no MultiQC output results for this module; check the number of reads removed in the `_bbduk.stats` output file.\n\nDefault: off\n",
+ "fa_icon": "fas fa-filter"
+ },
+ "metagenomic_complexity_entropy": {
+ "type": "number",
+ "default": 0.3,
+        "description": "Specify the entropy threshold below which a sequencing read will be filtered out during complexity filtering. Must be between 0 and 1.",
+ "minimum": 0,
+ "maximum": 1,
+        "help_text": "Specify the minimum entropy a read must have to be retained; reads below this threshold are _removed_ from the FASTQ file that goes into metagenomic screening.\n\nA mono-nucleotide read such as GGGGGG has an entropy of 0, while a completely random sequence has an entropy approaching 1.\n\nSee the `bbduk` [documentation](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/-filter) on entropy for more information.\n\n> Modifies `bbduk` parameter `entropy=`",
+ "fa_icon": "fas fa-percent"
+ },
"run_metagenomic_screening": {
"type": "boolean",
"description": "Turn on metagenomic screening module for reference-unmapped reads.",