diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8edacdf46..346449488 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,13 +34,13 @@ jobs: - name: Build new docker image if: env.MATCHED_FILES - run: docker build --no-cache . -t nfcore/eager:2.2.2 + run: docker build --no-cache . -t nfcore/eager:2.3 - name: Pull docker image if: ${{ !env.MATCHED_FILES }} run: | docker pull nfcore/eager:dev - docker tag nfcore/eager:dev nfcore/eager:2.2.2 + docker tag nfcore/eager:dev nfcore/eager:2.3 - name: Install Nextflow env: @@ -146,16 +146,16 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_pmdtools - name: GENOTYPING_UG AND MULTIVCFANALYZER Test running GATK UnifiedGenotyper and MultiVCFAnalyzer, additional VCFS run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer --additional_vcf_files 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/vcf/JK2772_CATCAGTGAGTAGA_L008_R1_001.fastq.gz.tengrand.fq.combined.fq.mapped_rmdup.bam.unifiedgenotyper.vcf.gz' --write_allele_frequencies + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer --additional_vcf_files 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/vcf/JK2772_CATCAGTGAGTAGA_L008_R1_001.fastq.gz.tengrand.fq.combined.fq.mapped_rmdup.bam.unifiedgenotyper.vcf.gz' --write_allele_frequencies - name: COMPLEX LANE/LIBRARY MERGING Test running lane and library merging prior to GATK UnifiedGenotyper and running MultiVCFAnalyzer run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer + nextflow 
run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer - name: GENOTYPING_UG ON TRIMMED BAM Test run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --run_trim_bam --genotyping_source 'trimmed' --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --run_trim_bam --genotyping_source 'trimmed' --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' - name: BAM_INPUT Run the basic pipeline with the bam input profile, skip AdapterRemoval as no convertBam run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --skip_adapterremoval + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --skip_adapterremoval - name: BAM_INPUT Run the basic pipeline with the bam input profile, convert to FASTQ for adapterremoval test and downstream run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --run_convertinputbam @@ -167,6 +167,9 @@ jobs: - name: METAGENOMIC Run the basic pipeline but with unmapped reads going into MALT run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt/" --malt_sam_output + - name: METAGENOMIC Run the basic pipeline but low-complexity filtered reads going into MALT + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt/" --metagenomic_complexity_filter - name: MALTEXTRACT Download resource files run: | mkdir -p databases/maltextract @@ -186,34 +189,6 @@ jobs: - name: MTNUCRATIO Run basic 
pipeline with bam input profile, but don't convert BAM, skip everything but nmtnucratio run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_humanbam,docker --skip_fastqc --skip_adapterremoval --skip_deduplication --skip_qualimap --skip_preseq --skip_damage_calculation --run_mtnucratio - - push_dockerhub: - name: Push new Docker image to Docker Hub - runs-on: ubuntu-latest - # Only run if the tests passed - needs: test - # Only run for the nf-core repo, for releases and merged PRs - if: ${{ github.repository == 'nf-core/eager' && (github.event_name == 'release' || github.event_name == 'push') }} - env: - DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} - DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASS }} - steps: - - name: Check out pipeline code - uses: actions/checkout@v2 - - - name: Build new docker image - run: docker build --no-cache . -t nfcore/eager:latest - - - name: Push Docker image to DockerHub (dev) - if: ${{ github.event_name == 'push' }} - run: | - echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin - docker tag nfcore/eager:latest nfcore/eager:dev - docker push nfcore/eager:dev - - name: Push Docker image to DockerHub (release) - if: ${{ github.event_name == 'release' }} - run: | - echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin - docker push nfcore/eager:latest - docker tag nfcore/eager:latest nfcore/eager:${{ github.ref }} - docker push nfcore/eager:${{ github.ref }} + - name: RESCALING Run basic pipeline with basic pipeline but with mapDamage rescaling of BAM files. 
Note this will be slow + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_mapdamage_rescaling --run_genotyping --genotyping_tool hc --genotyping_source 'rescaled' \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 39aba1bc7..e30265ed1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,30 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [2.3.0] - 2021-01-11 - "Aalen" + +### `Added` + +- [#640](https://github.com/nf-core/eager/issues/640) - Added a pre-metagenomic screening filtering of low-sequence complexity reads with `bbduk` +- [#583](https://github.com/nf-core/eager/issues/583) - Added `mapDamage2` rescaling of BAM files to remove damage +- Updated usage (merging files) and workflow images reflecting new functionality. + +### `Fixed` + +- Removed leftover old DockerHub push CI commands. +- [#627](https://github.com/nf-core/eager/issues/627) - Added de Barros Damgaard citation to README +- [#630](https://github.com/nf-core/eager/pull/630) - Better handling of Qualimap memory requirements and error strategy. 
+- Fixed some incomplete schema options to ensure users supply valid input values +- [#638](https://github.com/nf-core/eager/issues/638#issuecomment-748877567) Fixed inverted circularfilter filtering (previously filtering would happen by default, not when requested by user as originally recorded in documentation) +- [DeDup:](https://github.com/apeltzer/DeDup/commit/07d47868f10a6830da8c9161caa3755d9da155bf) Fixed Null Pointer Bug in DeDup by updating to 0.12.8 version +- [#650](https://github.com/nf-core/eager/pull/650) - Increased memory given to FastQC for larger files by making it multithreaded + +### `Dependencies` + +- Update: DeDup v0.12.7 to v0.12.8 + +### `Deprecated` + ## [2.2.2] - 2020-12-09 ### `Added` diff --git a/Dockerfile b/Dockerfile index b9d2d771d..4e8aad0c8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,10 +7,10 @@ COPY environment.yml / RUN conda env create --quiet -f /environment.yml && conda clean -a # Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH /opt/conda/envs/nf-core-eager-2.2.2/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-eager-2.3/bin:$PATH # Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-eager-2.2.2 > nf-core-eager-2.2.2.yml +RUN conda env export --name nf-core-eager-2.3 > nf-core-eager-2.3.yml # Instruct R processes to use these empty files instead of clashing with a local version RUN touch .Rprofile diff --git a/README.md b/README.md index 0a9675aed..67daebd3d 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,42 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. The pipeline pre-processes raw data from FASTQ inputs, or preprocessed BAM inputs. 
It can align reads and performs extensive general NGS and aDNA specific quality-control on the results. It comes with docker, singularity or conda containers making installation trivial and results highly reproducible.

- nf-core/eager schematic workflow -## Pipeline steps +## Quick Start + +1. Install [`nextflow`](https://nf-co.re/usage/installation) (version >= 20.04.0) + +2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`Podman`](https://podman.io/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_ + +3. Download the pipeline and test it on a minimal dataset with a single command: + + ```bash + nextflow run nf-core/eager -profile test_tsv,<docker/singularity/podman/conda/institute> + ``` + + > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. + +4. Start running your own analysis! + + ```bash + nextflow run nf-core/eager -profile <docker/singularity/podman/conda/institute> --input '*_R{1,2}.fastq.gz' --fasta '<your_reference>.fasta' + ``` + +5. Once your run has completed successfully, clean up the intermediate files. + + ```bash + nextflow clean -f -k + ``` + +See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline. + +**N.B.** You can see an overview of the run in the MultiQC report located at `./results/MultiQC/multiqc_report.html` + +Modifications to the default pipeline are easily made using various options as described in the documentation. 
+ +## Pipeline Summary ### Default Steps @@ -77,6 +109,7 @@ Additional functionality contained by the pipeline currently includes: #### Metagenomic Screening +* Low-sequence complexity filtering (`BBduk`) * Taxonomic binner with alignment (`MALT`) * Taxonomic binner without alignment (`Kraken2`) * aDNA characteristic screening of taxonomically binned data from MALT (`MaltExtract`) @@ -86,51 +119,9 @@ Additional functionality contained by the pipeline currently includes: A graphical overview of suggested routes through the pipeline depending on context can be seen below.

- nf-core/eager metro map -## Quick Start - -1. Install [`nextflow`](https://nf-co.re/usage/installation) (version >= 20.04.0) - -2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`Podman`](https://podman.io/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_ - -3. Download the pipeline and test it on a minimal dataset with a single command: - - ```bash - nextflow run nf-core/eager -profile test, - ``` - - > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. - -4. Start running your own analysis! - - ```bash - nextflow run nf-core/eager -profile --input '*_R{1,2}.fastq.gz' --fasta '.fasta' - ``` - -5. Once your run has completed successfully, clean up the intermediate files. - - ```bash - nextflow clean -f -k - ``` - -See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline. - -**N.B.** You can see an overview of the run in the MultiQC report located at `./results/MultiQC/multiqc_report.html` - -Modifications to the default pipeline are easily made using various options -as described in the documentation. - -## Pipeline Summary - -By default, the pipeline currently performs the following: - - - -* Sequencing quality control (`FastQC`) -* Overall pipeline run summaries (`MultiQC`) - ## Documentation The nf-core/eager pipeline comes with documentation about the pipeline: [usage](https://nf-co.re/eager/usage) and [output](https://nf-co.re/eager/output). 
@@ -236,6 +227,8 @@ In addition, references of tools and data used in this pipeline are as follows: * **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923). * **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools) * **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git) +* **mapDamage2** Jónsson, H., et al. 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics, 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193) +* **BBduk** Brian Bushnell (Unpublished). Download: [https://sourceforge.net/projects/bbmap/](https://sourceforge.net/projects/bbmap/) ## Data References @@ -244,3 +237,4 @@ This repository uses test data from the following studies: * Fellows Yates, J. A. et al. (2017) ‘Central European Woolly Mammoth Population Dynamics: Insights from Late Pleistocene Mitochondrial Genomes’, Scientific reports, 7(1), p. 17714. [doi: 10.1038/s41598-017-17723-1](https://doi.org/10.1038/s41598-017-17723-1). * Gamba, C. et al. (2014) ‘Genome flux and stasis in a five millennium transect of European prehistory’, Nature communications, 5, p. 5257. [doi: 10.1038/ncomms6257](https://doi.org/10.1038/ncomms6257). * Star, B. et al. (2017) ‘Ancient DNA reveals the Arctic origin of Viking Age cod from Haithabu, Germany’, Proceedings of the National Academy of Sciences of the United States of America, 114(34), pp. 9152–9157. [doi: 10.1073/pnas.1710186114](https://doi.org/10.1073/pnas.1710186114). +* de Barros Damgaard, P. et al. (2018). 
'137 ancient human genomes from across the Eurasian steppes.', Nature, 557(7705), 369–374. [doi: 10.1038/s41586-018-0094-2](https://doi.org/10.1038/s41586-018-0094-2) diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 7fc6cabd5..c105fcb4e 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -6,7 +6,6 @@ report_comment: > This report has been generated by the nf-core/eager analysis pipeline. For information about how to interpret these results, please see the documentation. - run_modules: - adapterRemoval - bowtie2 @@ -270,4 +269,4 @@ report_section_order: nf-core-eager-summary: order: -1001 -export_plots: true +export_plots: true \ No newline at end of file diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 201df4a58..5c9c0da9c 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -35,7 +35,9 @@ 'VCF2genome':['v_vcf2genome.txt', r"VCF2Genome \(v. ([0-9].[0-9]+) "], 'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"], 'kraken':['v_kraken.txt', r"Kraken version (\S+)"], - 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"] + 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"], + 'mapDamage2':['v_mapdamage.txt',r"(\S+)"], + 'bbduk':['v_bbduk.txt',r"(\S+)"] } results = OrderedDict() @@ -55,7 +57,7 @@ results['Qualimap'] = 'N/A' results['Preseq'] = 'N/A' results['GATK HaplotypeCaller'] = 'N/A' -#results['GATK UnifiedGenotyper'] = 'N/A' +results['GATK UnifiedGenotyper'] = 'N/A' results['freebayes'] = 'N/A' results['sequenceTools'] = 'N/A' results['VCF2genome'] = 'N/A' @@ -71,6 +73,8 @@ results['kraken'] = 'N/A' results['maltextract'] = 'N/A' results['eigenstrat_snp_coverage'] = 'N/A' +results['mapDamage2'] = 'N/A' +results['bbduk'] = 'N/A' # Search each file using its regex for k, v in regexes.items(): diff --git a/conf/base.config b/conf/base.config index dc58944ba..8266b9c72 100644 --- a/conf/base.config +++ 
b/conf/base.config @@ -74,7 +74,7 @@ process { } withName:qualimap{ - errorStrategy = 'ignore' + errorStrategy = { task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'finish' } } withName:preseq { diff --git a/conf/test_resources.config b/conf/test_resources.config index 109e93e4c..74bb4ce2a 100644 --- a/conf/test_resources.config +++ b/conf/test_resources.config @@ -51,4 +51,8 @@ process { time = { check_max( 10.m * task.attempt, 'time' ) } } + withName:'mapdamage_rescaling'{ + time = { check_max( 20.m * task.attempt, 'time' ) } + } + } \ No newline at end of file diff --git a/docs/images/output/overview/eager2_metromap_complex.png b/docs/images/output/overview/eager2_metromap_complex.png deleted file mode 100644 index ae866ea96..000000000 Binary files a/docs/images/output/overview/eager2_metromap_complex.png and /dev/null differ diff --git a/docs/images/output/overview/eager2_workflow.png b/docs/images/output/overview/eager2_workflow.png deleted file mode 100644 index 10f10fc06..000000000 Binary files a/docs/images/output/overview/eager2_workflow.png and /dev/null differ diff --git a/docs/images/output/overview/eager2_workflow.svg b/docs/images/output/overview/eager2_workflow.svg deleted file mode 100644 index 940990523..000000000 --- a/docs/images/output/overview/eager2_workflow.svg +++ /dev/null @@ -1,1622 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - fastq - - - - - bam - - - - - tsv - - - (fastq/bam) - - - - - fasta - - - or - or - Preprocessing - Evaluation - - - fastp - - - - AdapterRemoval v2 - - - - FastQC - - - - - - Mapping and postprocessing - Evaluation - - BWA/Bowtie2/CircularMapper - - - - DeDup/Picard - - - - preseq - - SAMtools - - - Qualimap 2 - - - - aDNA evaluationand modification - - - DamageProfiler - - - - bamUtils - - - - ANGSD - - - - PMDtools - - - Statistics - - - Sex.DetERRmine - - - - bedtools - - - - MTNucRatio - - - - Genotyping - - - GATK - - - - 
VCF2Genome - - - - MultiVCFAnalyzer - - - - FreeBayes - - - - - Metagenomic screening - Host DNAremoval - - - MALT - - - extract_map_reads.py - Kraken2 - Ancient meta-genomic screening - - - HOPS - - - - Indexing/conversion - - - SAMtools - - - - endorS.py - - Consensus calling - Reporting - - - - MultiQC - - - - Files for downstream analyses - - - fastq - - - - - bam - - - - - vcf - - - - - - ANGSD - - - sequenceTools - - - diff --git a/docs/images/usage/eager2_metromap_complex.png b/docs/images/usage/eager2_metromap_complex.png new file mode 100644 index 000000000..244bd76ad Binary files /dev/null and b/docs/images/usage/eager2_metromap_complex.png differ diff --git a/docs/images/output/overview/eager2_metromap_complex.svg b/docs/images/usage/eager2_metromap_complex.svg similarity index 91% rename from docs/images/output/overview/eager2_metromap_complex.svg rename to docs/images/usage/eager2_metromap_complex.svg index 7e636788f..930a940a9 100644 --- a/docs/images/output/overview/eager2_metromap_complex.svg +++ b/docs/images/usage/eager2_metromap_complex.svg @@ -7,14 +7,14 @@ xmlns="http://www.w3.org/2000/svg" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="297mm" - height="210mm" - viewBox="0 0 297 210" + width="297.05508mm" + height="170.45857mm" + viewBox="0 0 297.05508 170.45857" version="1.1" id="svg8" - inkscape:version="1.0.1 (1.0.1+r73)" - sodipodi:docname="EAGER_lonundmapsvg_complex_fin.svg" - inkscape:export-filename="/home/jfellows/Documents/graphics/diagrams/EAGER_lonundmapsvg_complex_fin.png" + inkscape:version="1.0.1 (1.0.1+r74)" + sodipodi:docname="eager2_metromap_complex_v2.svg" + inkscape:export-filename="/home/jfellows/Documents/graphics/diagrams/eager2_metromap_complex_v2.png" inkscape:export-xdpi="300" inkscape:export-ydpi="300"> + inkscape:snap-nodes="false" + fit-margin-top="0" + fit-margin-left="0" + fit-margin-right="0" + fit-margin-bottom="0"> + 
dotted="true" + originx="0.32694881" + originy="-15.312377" /> @@ -175,21 +181,22 @@ image/svg+xml - + + id="layer1" + transform="translate(0.32694882,-15.312377)"> + x="-1.7567509" + y="17.763466" /> @@ -558,7 +565,7 @@ sodipodi:nodetypes="csssc" /> @@ -654,18 +661,18 @@ EndorSpy - GATK's UnifiedGenotyperGATK's HaplotypeCallerSequenceTools' PileupCallerFreebayesANGSD PMDToolsBamUtils + x="228.58232" + y="66.459801" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:3.175px;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro';text-align:start;text-anchor:start;stroke-width:0.264583" + id="tspan1220">PMDToolsBamUtilsmapDamage2's rescale NuclearContamination(Human) PreSeq MtNucRatio (filtering) samtools' index Kraken Parse MaltExtract MALTKraken (Convert BAM) FastQC fastp AdapterRemoval FastQC + GATK's UnifiedGenotyperGATK's HaplotypeCallersequenceTool's pileupCallerfreebayesANGSD + + BBduk + diff --git a/docs/images/usage/eager2_workflow.png b/docs/images/usage/eager2_workflow.png new file mode 100644 index 000000000..628fbab0a Binary files /dev/null and b/docs/images/usage/eager2_workflow.png differ diff --git a/docs/images/usage/eager2_workflow.svg b/docs/images/usage/eager2_workflow.svg new file mode 100644 index 000000000..b39f52c90 --- /dev/null +++ b/docs/images/usage/eager2_workflow.svg @@ -0,0 +1,1807 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + fastq + + + + bam + + + + tsv + + (fastq/bam) + + + + fasta + + or + or + Preprocessing + Evaluation + + + fastp + + + + AdapterRemoval v2 + + + + FastQC + + + + + + Mapping and postprocessing + Evaluation + + BWA/Bowtie2/CircularMapper + + + + DeDup/MarkDuplicates + + + + preseq + + SAMtools + + + Qualimap 2 + + + + aDNA evaluationand modification + + + DamageProfiler + + + + bamUtils + + + + ANGSD + + + + PMDtools + + + Statistics + + + Sex.DetERRmine + + + + bedtools + + + + MTNucRatio + + 
+ + Genotyping + + + GATK + + + + VCF2Genome + + + + MultiVCFAnalyzer + + + + FreeBayes + + + + + Metagenomic screening + + + MALT + + Host DNAremoval + + strip_fastq + Kraken2 + Ancient meta-genomic screening + + + HOPS + + + + Indexing/conversion + + + SAMtools + + + + endorS.py + + Consensus calling + CC-BY 4.0 + Reporting + + + + MultiQC + + + Files for downstream analyses + + + fastq + + + + bam + + + + vcf + + + + + ANGSD + + + pileupCaller + + + mapDamage2 + + + Complexityfiltering + + + bbduk + + + + diff --git a/docs/images/usage/merging_files.png b/docs/images/usage/merging_files.png index 55a885b9e..72f1c487a 100644 Binary files a/docs/images/usage/merging_files.png and b/docs/images/usage/merging_files.png differ diff --git a/docs/images/usage/merging_files.svg b/docs/images/usage/merging_files.svg index d3f61cc7d..a3c13f2b4 100644 --- a/docs/images/usage/merging_files.svg +++ b/docs/images/usage/merging_files.svg @@ -1,4 +1,6 @@ + + + id="svg8" + sodipodi:docname="merging_files.svg" + inkscape:version="0.92.1 r15371"> - - - - - - + only_selected="false" /> - - - - - + only_selected="false" /> - - - - - + only_selected="false" /> + + + + + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> + + + + + + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> + + + + + + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> - - - + apply_with_weight="true" + only_selected="false" /> - - - + apply_with_weight="true" + only_selected="false" /> - - + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> - + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> - - - - - - + simplifyJustCoalesce="false" /> - + apply_no_weight="true" + apply_with_weight="true" + 
only_selected="false" /> - - - - - - + simplifyJustCoalesce="false" /> - + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> - - - - - - + simplifyJustCoalesce="false" /> + + + + steps="1" + threshold="0.00961538" + smooth_angles="360" + helper_size="0" + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> + + + steps="1" + threshold="0.0043662" + smooth_angles="360" + helper_size="0" + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> + + steps="1" + threshold="0.0043662" + smooth_angles="360" + helper_size="0" + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> - + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> - + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> - + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> - + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> + + steps="1" + threshold="0.0043662" + smooth_angles="360" + helper_size="0" + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> + + steps="1" + threshold="0.0043662" + smooth_angles="360" + helper_size="0" + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> + + steps="1" + threshold="0.0043662" + smooth_angles="360" + helper_size="0" + simplify_individual_paths="false" + simplify_just_coalesce="false" + 
simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> + + steps="1" + threshold="0.000408163" + smooth_angles="360" + helper_size="0" + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> + + steps="1" + threshold="0.000408163" + smooth_angles="360" + helper_size="0" + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> + + steps="1" + threshold="0.000408163" + smooth_angles="360" + helper_size="0" + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> + + steps="1" + threshold="0.000408163" + smooth_angles="360" + helper_size="0" + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> + + steps="1" + threshold="0.000408163" + smooth_angles="360" + helper_size="0" + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> - + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> - + simplify_individual_paths="false" + simplify_just_coalesce="false" + simplifyindividualpaths="false" + simplifyJustCoalesce="false" /> - - - - - + only_selected="false" /> - - - - - + only_selected="false" /> - - - - - + only_selected="false" /> - - - - - + only_selected="false" /> - + + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> + + + + + + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> + + + + + + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> + + + + 
+ + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> + + + + + + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> + + + + + + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> + + + + + + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> + + + + + + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> + + + + + + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> + + + + + + weight="33.333333" + steps="2" + helper_size="0" + apply_no_weight="true" + apply_with_weight="true" + only_selected="false" /> + bordercolor="#666666" + borderopacity="1.0" + inkscape:pageopacity="0.0" + inkscape:pageshadow="2" + inkscape:zoom="0.98994949" + inkscape:cx="196.24187" + inkscape:cy="-18.794667" + inkscape:document-units="mm" + inkscape:current-layer="g5002" + showgrid="false" + inkscape:measure-start="0,0" + inkscape:measure-end="0,0" + inkscape:window-width="1920" + inkscape:window-height="986" + inkscape:window-x="-11" + inkscape:window-y="-11" + inkscape:window-maximized="1"> + type="xygrid" + id="grid4491" /> @@ -1426,788 +1320,1387 @@ + id="layer1"> + y="69.219543" + id="text6552"> + inkscape:export-ydpi="289.40701"> + inkscape:export-ydpi="289.40701"> + x="-8.8817842e-015" + height="89.835548" + width="214.3125" + id="rect5125" + style="opacity:1;fill:#ffffff;fill-opacity:0.95238097;stroke:#ffffff;stroke-width:0.50724828;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" + inkscape:export-xdpi="289.40701" + 
inkscape:export-ydpi="289.40701" /> poop1_libA_4_PE_full_lane1 + x="9.1932545" + id="tspan4608-4-9" + sodipodi:role="line">sampleA_libraryA_4_PE_UDGfull_lane1 poop1_libA_4_PE_full_lane2 + x="8.4063387" + id="tspan4608-4-9-9" + sodipodi:role="line">sampleA_libraryA_4_PE_UDGfull_lane2 poop1_libB_4_PE_full_lane1 + x="9.5234709" + id="tspan4608-4-9-3" + sodipodi:role="line">sampleA_libraryB_4_PE_UDGfull_lane1 sampleA_libraryB_4_PE_UDGfull_lane2 + sampleA_libraryB_2_PE_UDGfull_lane1 + sampleA_libraryB_4_SE_UDGfull_lane3 + sampleA_libraryC_4_PE_UDGhalf_lane3 + sampleA_libraryD_4_PE_UDGnone_lane3 + + + + + + poop1_libB_4_PE_full_lane2 + inkscape:export-ydpi="289.40701" /> poop1_libB_2_PE_full_lane1 - FastP + poop1_libB_4_SE_full_lane3 - + poop1_libC_4_PE_half_lane3 + inkscape:export-ydpi="289.40701" /> + + poop1_libD_4_PE_none_lane3 + id="text4610-36-7" + y="93.195404" + x="-161.08456" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.93888903px;line-height:1.25;font-family:Kalam;-inkscape-font-specification:Kalam;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.1;stroke-miterlimit:4;stroke-dasharray:none" + xml:space="preserve">Preprocessing + inkscape:export-ydpi="289.40701" /> + + - Mapping + + + id="text4610-43-4-1" + y="164.70946" + x="-183.25703" + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.93888903px;line-height:1.25;font-family:Kalam;-inkscape-font-specification:Kalam;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.1;stroke-miterlimit:4;stroke-dasharray:none" + xml:space="preserve">BAM trim + - + + + + + - + FastP + inkscape:export-xdpi="289.40701" + id="text4610-0-6" + y="192.61006" + x="-158.87413" + 
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:4.93888903px;line-height:1.25;font-family:Kalam;-inkscape-font-specification:Kalam;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.1;stroke-miterlimit:4;stroke-dasharray:none" + xml:space="preserve">Genotyping + - + - + + + Preprocessing + inkscape:export-xdpi="289.40701" + style="fill:#d4aa00;stroke:#00aad4;stroke-width:0.69999999;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + d="m 43.17947,137.54221 h 4.497917" + id="path4786-7-5" + inkscape:connector-curvature="0" /> + style="fill:#d4aa00;stroke:#00aad4;stroke-width:0.69999999;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + d="m 42.38572,146.80263 h 4.497917" + id="path4786-7-8" + inkscape:connector-curvature="0" /> - + - + + Mapping - + - + + BAM trim + inkscape:export-xdpi="289.40701" + style="fill:#d4aa00;stroke:#214478;stroke-width:0.69999999;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" + d="m 21.483635,128.2818 h 5.820834" + id="path4786-7-0-3" + inkscape:connector-curvature="0" /> - - - - - + - + + Genotyping + id="tspan4673">CC-BY 4.0 + style="fill:#d4aa00;stroke:#b9cdec;stroke-width:0.68313009;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:0.99607843" + d="M 41.539583,174.49792 H 46.83125" + id="path4786-7-0-66-7" + inkscape:connector-curvature="0" /> + + id="path4839" + d="m 128.32292,163.38542 3.96875,-9.26042" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" /> + id="path4841" + d="m 101.86458,154.125 3.96875,-9.26042" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" /> + 
style="opacity:1;fill:#d4aa00;fill-opacity:1;stroke:#d4aa00;stroke-width:0.5;stroke-linecap:square;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" + id="path4754-1" + sodipodi:type="arc" + sodipodi:cx="105.83333" + sodipodi:cy="144.86458" + sodipodi:rx="1.3229166" + sodipodi:ry="1.3229166" + sodipodi:start="0" + sodipodi:end="6.2807824" + d="m 107.15624,144.86458 a 1.3229166,1.3229166 0 0 1 -1.32212,1.32291 1.3229166,1.3229166 0 0 1 -1.32371,-1.32132 1.3229166,1.3229166 0 0 1 1.32053,-1.32451 1.3229166,1.3229166 0 0 1 1.3253,1.31974 l -1.32291,0.003 z" /> + + + Deduplication + + + sampleA_libraryA_colchem4_PE_UDGfull_lane1 + sampleA_libraryA_colchem4_PE_UDGfull_lane2 + sampleA_libraryB_colchem4_PE_UDGfull_lane1 + sampleA_libraryB_colchem4_PE_UDGfull_lane2 + sampleA_libraryB_colchem2_PE_UDGfull_lane1 + sampleA_libraryB_colchem4_SE_UDGfull_lane3 + sampleA_libraryC_colchem4_PE_UDGhalf_lane3 + sampleA_libraryD_colchem4_PE_UDGnone_lane3 + + id="path4990-7" + d="m 62.161743,291.94628 h 41.010417 l 3.65179,-4.84845" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + sodipodi:nodetypes="ccc" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + id="path5010-5" + d="m 62.161743,301.2067 h 41.010417 l 4.18094,9.17445" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + id="path5012-0" + d="m 62.109356,310.38116 45.243744,-1e-5" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + inkscape:export-xdpi="305.60999" + 
inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + id="path5014-4" + d="m 62.161743,319.72753 41.222607,-0.086" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + + FastP + id="path5016-3" + d="M 62.109356,328.90199 H 129.8427" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + id="path5018-8" + d="m 62.161743,338.24836 112.660107,-0.086" + style="fill:none;stroke:#000000;stroke-width:0.25475112px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> - + id="path5020-7" + d="m 62.161743,347.50878 112.660107,-0.086" + style="fill:none;stroke:#000000;stroke-width:0.26424816px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + + Preprocessing + id="path5022-0" + d="m 106.82395,287.09782 h 49.2125" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + id="path5024-2" + d="m 107.3531,310.38115 h 22.4896 l 3.96875,9.26042" + 
style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + + Mapping + + BAM trim + id="path5047-8" + d="m 156.03645,287.09782 2.9104,16.66875" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + sodipodi:nodetypes="cc" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + id="path5059-2" + d="m 158.94685,303.76657 h 15.875" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + sodipodi:nodetypes="cc" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + id="path5071-4" + d="m 178.7906,324.93324 h 30.4271" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + sodipodi:nodetypes="cc" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + + + + + + Genotyping + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + sodipodi:nodetypes="cc" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + sodipodi:nodetypes="cc" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + inkscape:path-effect="#path-effect4748-2" + inkscape:original-d="m 174.82185,338.1624 c 1.28735,-4.29051 2.64611,-8.81971 3.96875,-13.22916" + sodipodi:nodetypes="cc" + 
inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + + CC-BY 4.0CC-BY 4.0 - - + id="tspan4662-9" /> + id="path4839-9" + d="m 129.8427,328.90199 3.96875,-9.26042" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" /> + id="path4841-3" + d="m 103.38435,319.64157 3.96875,-9.26042" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" /> + id="path4754-1-4" + sodipodi:type="arc" + sodipodi:cx="106.82394" + sodipodi:cy="310.38116" + sodipodi:rx="1.3229166" + sodipodi:ry="1.3229166" + sodipodi:start="0" + sodipodi:end="6.2807824" + d="m 108.14685,310.38116 a 1.3229166,1.3229166 0 0 1 -1.32212,1.32292 1.3229166,1.3229166 0 0 1 -1.32371,-1.32133 1.3229166,1.3229166 0 0 1 1.32053,-1.3245 1.3229166,1.3229166 0 0 1 1.3253,1.31974 l -1.32291,0.003 z" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + id="path4873-1" + d="m 133.81145,319.64157 c 7.43379,0 14.99331,0 22.48958,0" + style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" /> + id="path4754-1-5-5" + sodipodi:type="arc" + sodipodi:cx="133.81145" + sodipodi:cy="319.64157" + sodipodi:rx="1.3229166" + sodipodi:ry="1.3229166" + sodipodi:start="0" + 
sodipodi:end="6.2807824" + d="m 135.13436,319.64157 a 1.3229166,1.3229166 0 0 1 -1.32212,1.32292 1.3229166,1.3229166 0 0 1 -1.32371,-1.32133 1.3229166,1.3229166 0 0 1 1.32053,-1.3245 1.3229166,1.3229166 0 0 1 1.3253,1.31973 l -1.32291,0.003 z" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + height="76.729164" + x="141.6501" + y="-352.27127" + ry="3.2424433" + transform="scale(1,-1)" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> Deduplication + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png">Deduplication + id="path4754-1-9-9" + sodipodi:type="arc" + sodipodi:cx="158.94684" + sodipodi:cy="303.76657" + sodipodi:rx="1.3229166" + sodipodi:ry="1.3229166" + sodipodi:start="0" + sodipodi:end="6.2807824" + d="m 160.26976,303.76657 a 1.3229166,1.3229166 0 0 1 -1.32213,1.32292 1.3229166,1.3229166 0 0 1 -1.32371,-1.32133 1.3229166,1.3229166 0 0 1 1.32053,-1.3245 1.3229166,1.3229166 0 0 1 1.3253,1.31973 l -1.32291,0.003 z" + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" /> + Merge libraries + Merge UDG treatments + Merge lanes + Merge read- pair types + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" /> + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" /> + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" /> + inkscape:export-filename="C:\Users\fagernaes\Pictures\merging_files_review.png" + inkscape:export-xdpi="305.60999" + inkscape:export-ydpi="305.60999" /> + diff --git a/docs/output.md b/docs/output.md index 7c316e7db..86a710c56 100644 --- a/docs/output.md +++ 
b/docs/output.md @@ -658,11 +658,13 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir - `damageprofiler/` - this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. - `pmdtools/` - this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`. - `trimmed_bam/` - this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read). +- `damage_rescaling/` - this contains rescaled BAM files from mapDamage2. These BAM files have damage probabilistically removed via a bayesian model, and can be used for downstream genotyping. - `genotyping/` - this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics. 
- `multivcfanalyzer/` - this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files. - `sex_determination/` - this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. - `nuclear_contamination/` - this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt` which is a summary table of the results for all individual. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimate (with their respective standard errors) for both Method1 and Method2. - `bedtools/` - this contains two files as the output from bedtools coverage. One file contains the 'breadth' coverage (`*.breadth.gz`). This file will have the contents of your annotation file (e.g. BED/GFF), and the following subsequent columns: no. reads on feature, # bases at depth, length of feature, and % of feature. The second file (`*.depth.gz`), contains the contents of your annotation file (e.g. BED/GFF), and an additional column which is mean depth coverage (i.e. average number of reads covering each position). 
+- `metagenomic_complexity_filter/` - this contains the output from removing low-sequence complexity reads, as performed by `bbduk`, from the input reads to metagenomic classification. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`*_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `*_bbduk.stats` files to get summary statistics of the filtering. - `metagenomic_classification/` - this contains the output for a given metagenomic classifier. - Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested. - Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. diff --git a/docs/usage.md b/docs/usage.md index b5094f606..7f0f4a6e3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -289,6 +289,8 @@ Alternatively to the [direct input method](#direct-input-method), you can supply Schematic diagram indicating merging points of different types of libraries, given a TSV input. Dashed boxes are optional library-specific processes

+> Only different libraries from a single sample that have been BAM trimmed will be merged together. Rescaled or PMD filtered libraries will not be merged prior to genotyping as each library _may_ have a different model applied to it and have its own biases (i.e. users may need to play around with settings to get optimal damage removal). + The use of the TSV `--input` method is recommended when performing more complex procedures such as lane or library merging. You do not need to specify `--single_end`, `--bam`, `--colour_chemistry`, `-udg_type` etc. when using TSV input - this is defined within the TSV file itself. You can only supply a single TSV per run (i.e. `--input '*.tsv'` will not work). This TSV should look like the following: @@ -391,203 +393,6 @@ hard drive footprint of the run, so be sure to do this! ## Troubleshooting and FAQs -### My pipeline update doesn't seem to do anything - -To download a new version of a pipeline, you can use the following, replacing -`` to the corresponding version. - -```bash -nextflow pull nf-core/eager -r -``` - -However, in very rare cases, minor fixes to a version will be pushed out without -a version number bump. This can confuse nextflow slightly, as it thinks you -already have the 'broken' version from your original pipeline download. - -If when running the pipeline you don't see any changes in the fixed version when -running it, you can try removing your nextflow EAGER cache typically stored in -your home directory with - -```bash -rm -r ~/.nextflow/assets/nf-core/eager -``` - -And re-pull the pipeline with the command above. This will install a fresh -version of the version with the fixes. - -### Input files not found - -When using the [direct input](#direct-input-method) method: if no file, only one -input file, or only 'read one' and not 'read two' is picked up then something is -likely wrong with your input file declaration ([`--input`](#--input)): - -1. The path must be enclosed in quotes (`'` or `"`) -2.
The path must have at least one `*` wildcard character. This is even if you - are only running one paired end sample. -3. When using the pipeline with paired end data, the path must use `{1,2}` or - `{R1,R2}` notation to specify read pairs. -4. If you are running single-end data make sure to specify `--single_end` - -**Important**: The pipeline can't take a list of multiple input files when using -the direct input method - it takes a 'glob' expression. If your input files are -scattered in different paths then we recommend that you generate a directory -with symlinked files. If running in paired-end mode please make sure that your -files are sensibly named so that they can be properly paired. See the previous -point. - -If the pipeline can't find your files then you will get the following error - -```bash -ERROR ~ Cannot find any reads matching: *{1,2}.fastq.gz -``` - -If your sample name is "messy" then you have to be very particular with your -glob specification. A file name like `L1-1-D-2h_S1_L002_R1_001.fastq.gz` can be -difficult enough for a human to read. Specifying `*{1,2}*.gz` won't work give -you what you want whilst `*{R1,R2}*.gz` (i.e. the addition of the `R`s) will. - -If using the [TSV input](#tsv-input-method) method, this likely means there is a -mistake or typo in the path in a given column. Often this is a trailing space at -the end of the path. - -### I am only getting output for a single sample although I specified multiple with wildcards - -You must specify paths to files in quotes, otherwise your shell will evaluate -any wildcards (\*) rather than Nextflow. - -For example - -```bash -nextflow run nf-core/eager --input /path/to/sample_*/*.fq.gz -``` - -Would be evaluated by your shell as - -```bash -nextflow run nf-core/eager --input /path/to/sample_1/sample_1.fq.gz /path/to/sample_1/sample_1.fq.gz /path/to/sample_1/sample_1.fq.gz -``` - -And Nextflow will only take the first path after `--input`, ignoring the others. 
- -On the other hand, encapsulating the path in quotes will allow Nextflow to -evaluate the paths. - -```bash -nextflow run nf-core/eager --input "/path/to/sample_*/*.fq.gz" -``` - -### The pipeline crashes almost immediately with an early pipeline step - -Sometimes a newly downloaded and set up nf-core/eager pipeline will encounter an -issue where a run almost immediately crashes (e.g. at `fastqc`, -`output_documentation` etc.) saying the tool could not be found or similar. - -#### I am running Docker - -You may have an outdated container. This happens more often when running on the -`dev` branch of nf-core/eager, because Docker will _not_ update the container on -each new commit, and thus may not get new tools called within the pipeline code. - -To fix, just re-pull the nf-core/eager Docker container manually with: - -```bash -docker pull nfcore/eager:dev -``` - -#### I am running Singularity - -If you're running Singularity, it could be that Nextflow cannot access your -Singularity image properly - often due to missing bind paths. - -See -[here](https://nf-co.re/usage/troubleshooting#cannot-find-input-files-when-using-singularity) -for more information. - -### The pipeline has crashed with an error but Nextflow is still running - -If this happens, you can either wait until all other already running jobs to -safely finish, or if Nextflow _still_ does not stop press `ctrl + c` on your -keyboard (or equivalent) to stop the Nextflow run. - -> :warning: if you do this, and do not plan to fix the run make sure to delete -the output folder. Otherwise you may end up a lot of large intermediate files -being left! You can clean a Nextflow run of all intermediate files with -`nextflow clean -f -k` or delete the `work/` directory. 
- -### I get a exceeded job memory limit error - -While Nextflow tries to make your life easier by automatically retrying jobs -that run out of memory with more resources (until your specified max-limit), -sometimes you may have such large data you run out even after the default 3 -retries. - -To fix this you need to change the default memory requirements for the process -that is breaking. We can do this by making a custom profile, which we then -provide to the Nextflow run command. - -For example, lets say it's the `markduplicates` process that is running out of -memory. - -First we need to check to see what default memory value we have. We can do this -by going to the main [nf-core/eager code](https://github.com/nf-core/) and -opening the `main.nf` file. We can then use your browser's find functionality -for: `process markduplicates`. - -Once found, we then need to check the line called `label`. In this case the -label is `mc_small` (for multi-core small). - -Next we need to go back to the main github repository, and open -`conf/base.config`. Again using our find functionality, we search for: -`withLabel:'mc_small'`. - -We see that the `memory` is set to `4.GB` (`memory = { check_max( 4.GB * -task.attempt, 'memory' )})`). - -Now back on your computer, we need to make a new file called -`custom_resources.conf`. You should save it somewhere centrally so you can -reuse it. - -> If you think this would be useful for multiple people in your lab/institute, -> we highly recommend you make an institutional profile at -> [nf-core/configs](https://github.com/nf-core/configs). This will simplify this -> process in the future. - -Within this file, you will need to add the following: - -```txt -profiles { - big_data { - process { - withName: markduplicates { - memory = 16.GB - } - } - } -} -``` - -Where we have increased the default `4.GB` to `16.GB`. Make sure that you keep -the `check_max` function, as this prevents your run asking for too much memory -during retries. 
- -> Note that with this you will _not_ have the automatic retry mechanism. If -> you want this, re-add the `check_max()` function on the `memory` line, and -> add to the bottom of the entire file (outside the profiles block), the -> block starting `def check_max(obj, type) {`, which is at the end of the -> [nextflow.config file](https://github.com/nf-core/eager/blob/master/nextflow.config) - -Once saved, we can then modify your original Nextflow run command: - -```bash -nextflow run nf-core/eager -r 2.2.0 -c ///custom_resources.conf -profile big_data,, <...> -``` - -Where we have added `-c` to specify which file to use for the custom profiles, -and then added the `big_data` profile to the original profiles you were using. - -:warning: it's important that big_data comes first, to ensure it overwrites any -parameters set in the subsequent profiles! - ### I get a file name collision error during merging When using TSV input, nf-core/eager will attempt to merge all `Lanes` of a @@ -608,37 +413,6 @@ they are unique (e.g. if one library was sequenced on Lane 8 of two HiSeq runs, specify lanes as 8 and 16 for each FASTQ file respectively). For library merging errors, you must modify your `Library_ID`s accordingly, to make them unique. -### I specified a module and it didn't produce the expected output - -Possible options: - -1. Check there if you have a typo in the parameter name. Nextflow _does not_ - check for this -2. Check that an upstream module was turned on (if a module requires the output - of a previous module, it will not be activated unless it receives the output) - -### I get a unable to acquire lock - -Errors like the following - -```bash -Unable to acquire lock on session with ID 84333844-66e3-4846-a664-b446d070f775 -``` - -normally suggest a previous Nextflow run (on the same folder) was not cleanly -killed by a user (e.g. using ctrl + z to hard kill a crashed run). 
- -To fix this, you must clean the entirety of the output directory (including -output files) e.g. with `rm -r /* /.*` and re-running -from scratch. - -`ctrl +z` is **not** a recommended way of killing a Nextflow job. Runs that take -a long time to fail are often still running because other job submissions are -still running. Nextflow will normally wait for those processes to complete -before cleaning shutting down the run (to allow rerunning of a run with -`-resume`). `ctrl + c` is much safer as it will tell Nextflow to stop earlier -but cleanly. - ## Tutorials ### Tutorial - How to investigate a failed run diff --git a/environment.yml b/environment.yml index e0e05c545..afa4cf6f7 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-eager-2.2.2 +name: nf-core-eager-2.3 channels: - conda-forge - bioconda @@ -18,7 +18,7 @@ dependencies: - bioconda::bwa=0.7.17 - bioconda::picard=2.22.9 - bioconda::samtools=1.9 - - bioconda::dedup=0.12.7 + - bioconda::dedup=0.12.8 - bioconda::angsd=0.933 - bioconda::circularmapper=1.93.5 - bioconda::gatk4=4.1.7.0 @@ -47,3 +47,6 @@ dependencies: - conda-forge::xopen=0.9.0 - bioconda::bowtie2=2.4.1 - bioconda::eigenstratdatabasetools=1.0.2 + - bioconda::mapdamage2=2.2.0 + - bioconda::bbmap=38.87 + diff --git a/main.nf b/main.nf index ee9883454..3eaac7434 100644 --- a/main.nf +++ b/main.nf @@ -87,7 +87,7 @@ def helpMessage() { --bwaalnl [num] Specify the -l parameter for BWA aln, i.e. length of seeds to be used. Set to 1024 for whole read. Default: ${params.bwaalnl} --circularextension [num] Specify the number of bases to extend reference by (circularmapper only). Default: ${params.circularextension} --circulartarget [chr] Specify the FASTA header of the target chromosome to extend(circularmapper only). 
Default: '${params.circulartarget}' - --circularfilter [bool] Turn on to filter off-target reads (circularmapper only). + --circularfilter [bool] Turn on to remove reads that did not map to the circularised genome (circularmapper only). --bt2_alignmode [str] Specify the bowtie2 alignment mode. Options: 'local', 'end-to-end'. Default: '${params.bt2_alignmode}' --bt2_sensitivity [str] Specify the level of sensitivity for the bowtie2 alignment mode. Options: 'no-preset', 'very-fast', 'fast', 'sensitive', 'very-sensitive'. Default: '${params.bt2_sensitivity}' --bt2n [num] Specify the -N parameter for bowtie2 (mismatches in seed). This will override defaults from alignmode/sensitivity. Default: ${params.bt2n} @@ -116,12 +116,15 @@ def helpMessage() { --damageprofiler_length [num] Specify length filter for DamageProfiler. Default: ${params.damageprofiler_length} --damageprofiler_threshold [num] Specify number of bases of each read to consider for DamageProfiler calculations. Default: ${params.damageprofiler_threshold} --damageprofiler_yaxis [float] Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'. Default: ${params.damageprofiler_yaxis} + --run_mapdamage_rescaling Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage. + --rescale_length_5p Length of read for mapDamage2 to rescale from 5p end. Default: ${params.rescale_length_5p} + --rescale_length_3p Length of read for mapDamage2 to rescale from 3p end. Default: ${params.rescale_length_3p} --run_pmdtools [bool] Turn on PMDtools --pmdtools_range [num] Specify range of bases for PMDTools. Default: ${params.pmdtools_range} --pmdtools_threshold [num] Specify PMDScore threshold for PMDTools. Default: ${params.pmdtools_threshold} --pmdtools_reference_mask [file] Specify a path to reference mask for PMDTools. --pmdtools_max_reads [num] Specify the maximum number of reads to consider for metrics generation.
Default: ${params.pmdtools_max_reads} - + Annotation Statistics --run_bedtools_coverage [bool] Turn on ability to calculate no. reads, depth and breadth coverage of features in reference. --anno_file [file] Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes. @@ -137,7 +140,7 @@ def helpMessage() { Genotyping --run_genotyping [bool] Turn on genotyping of BAM files. --genotyping_tool [str] Specify which genotyper to use either GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'. - --genotyping_source [str] Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed' or 'pmd'. Default: '${params.genotyping_source}' + --genotyping_source [str] Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd', 'rescaled'. Default: '${params.genotyping_source}' --gatk_call_conf [num] Specify GATK phred-scaled confidence threshold. Default: ${params.gatk_call_conf} --gatk_ploidy [num] Specify GATK organism ploidy. Default: ${params.gatk_ploidy} --gatk_downsample [num] Maximum depth coverage allowed for genotyping before down-sampling is turned on. Default: ${params.gatk_downsample} @@ -193,19 +196,21 @@ def helpMessage() { --contamination_chrom_name [str] The name of the X chromosome in your bam or FASTA header. 'X' for hs37d5, 'chrX' for HG19. Default: '${params.contamination_chrom_name}' Metagenomic Screening - --run_metagenomic_screening [bool] Turn on metagenomic screening module for reference-unmapped reads - --metagenomic_tool [str] Specify which classifier to use. Options: 'malt', 'kraken'. Default: '${params.contamination_chrom_name}' - --database [dir] Specify path to classifer database directory. For Kraken2 this can also be a `.tar.gz` of the directory. - --metagenomic_min_support_reads [num] Specify a minimum number of reads a taxon of sample total is required to have to be retained. 
Not compatible with . Default: ${params.metagenomic_min_support_reads} - --percent_identity [num] Percent identity value threshold for MALT. Default: ${params.percent_identity} - --malt_mode [str] Specify which alignment method to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'. Default: '${params.malt_mode}' - --malt_alignment_mode [str] Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'. Default: '${params.malt_alignment_mode}' - --malt_top_percent [num] Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual). Default: ${params.malt_top_percent} - --malt_min_support_mode [str] Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'. Default: '${params.malt_min_support_mode}' - --malt_min_support_percent [num] Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT. Default: Default: ${params.malt_min_support_percent} - --malt_max_queries [num] Specify the maximium number of queries a read can have for MALT. Default: ${params.malt_max_queries} - --malt_memory_mode [str] Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'. Default: '${params.malt_memory_mode}' - --malt_sam_output [bool] Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes. + --metagenomic_complexity_filter Turn on removal of low-sequence complexity reads for metagenomic screening with bbduk. + --metagenomic_complexity_entropy Specify the entropy threshold that under which a sequencing read will be complexity filtered out. This should be between 0-1. 
Default: '${params.metagenomic_complexity_entropy}' + --run_metagenomic_screening [bool] Turn on metagenomic screening module for reference-unmapped reads + --metagenomic_tool [str] Specify which classifier to use. Options: 'malt', 'kraken'. Default: '${params.contamination_chrom_name}' + --database [dir] Specify path to classifier database directory. For Kraken2 this can also be a `.tar.gz` of the directory. + --metagenomic_min_support_reads [num] Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with . Default: ${params.metagenomic_min_support_reads} + --percent_identity [num] Percent identity value threshold for MALT. Default: ${params.percent_identity} + --malt_mode [str] Specify which alignment method to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'. Default: '${params.malt_mode}' + --malt_alignment_mode [str] Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'. Default: '${params.malt_alignment_mode}' + --malt_top_percent [num] Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual). Default: ${params.malt_top_percent} + --malt_min_support_mode [str] Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'. Default: '${params.malt_min_support_mode}' + --malt_min_support_percent [num] Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT. Default: ${params.malt_min_support_percent} + --malt_max_queries [num] Specify the maximum number of queries a read can have for MALT. Default: ${params.malt_max_queries} + --malt_memory_mode [str] Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'. Default: '${params.malt_memory_mode}' + --malt_sam_output [bool] Specify to also produce SAM alignment files.
Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes. Metagenomic Authentication --run_maltextract [bool] Turn on MaltExtract for MALT aDNA characteristics authentication @@ -285,7 +290,7 @@ if("${params.fasta}".endsWith(".gz")){ path zipped_fasta from file(params.fasta) // path doesn't like it if a string of an object is not prefaced with a root dir (/), so use file() to resolve string before parsing to `path` output: - path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd + path "$unzip" into ch_fasta into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer,ch_fasta_for_genotyping_angsd,ch_fasta_for_damagerescaling script: unzip = zipped_fasta.toString() - '.gz' @@ -296,7 +301,7 @@ if("${params.fasta}".endsWith(".gz")){ } else { fasta_for_indexing = Channel .fromPath("${params.fasta}", checkIfExists: true) - .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; 
ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd } + .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer;ch_fasta_for_genotyping_angsd;ch_fasta_for_damagerescaling } } // Check that fasta index file path ends in '.fai' @@ -413,8 +418,13 @@ if (params.sexdeterrmine_bedfile == '') { // Genotyping validation if (params.run_genotyping){ + + if (params.genotyping_source != 'raw' && params.genotyping_source != 'pmd' && params.genotyping_source != 'trimmed' && params.genotyping_source != 'rescaled' ) { + exit 1, "[nf-core/eager] error: please specify a valid genotyping source. Options: 'raw', 'pmd', 'trimmed', 'rescaled'. Found parameter: --genotyping_source '${params.genotyping_source}'." + } + if (params.genotyping_tool != 'ug' && params.genotyping_tool != 'hc' && params.genotyping_tool != 'freebayes' && params.genotyping_tool != 'pileupcaller' && params.genotyping_tool != 'angsd' ) { - exit 1, "[nf-core/eager] error: please specify a genotyper. Options: 'ug', 'hc', 'freebayes', 'pileupcaller'. Found parameter: --genotyping_tool '${params.genotyping_tool}'." + exit 1, "[nf-core/eager] error: please specify a valid genotyper. Options: 'ug', 'hc', 'freebayes', 'pileupcaller'. Found parameter: --genotyping_tool '${params.genotyping_tool}'." 
} if (params.gatk_ug_out_mode != 'EMIT_VARIANTS_ONLY' && params.gatk_ug_out_mode != 'EMIT_ALL_CONFIDENT_SITES' && params.gatk_ug_out_mode != 'EMIT_ALL_SITES') { @@ -506,6 +516,7 @@ if (params.run_multivcfanalyzer) { } // Metagenomic validation + if (params.run_metagenomic_screening) { if ( params.bam_unmapped_type == "discard" ) { exit 1, "[nf-core/eager] error: metagenomic classification can only run on unmapped reads. Please supply --bam_unmapped_type 'fastq'. Supplied: --bam_unmapped_type '${params.bam_unmapped_type}'." @@ -808,6 +819,8 @@ Channel.from(summary.collect{ [it.key, it.value] }) """.stripIndent() } .set { ch_workflow_summary } +log.info "Schaffa, Schaffa, Genome Baua!" + /////////////////////////////////////////////////// /* -- REFERENCE FASTA INDEXING -- */ /////////////////////////////////////////////////// @@ -1008,7 +1021,7 @@ process indexinputbam { // Raw sequencing QC - allow user evaluate if sequencing any good? process fastqc { - label 'sc_small' + label 'mc_small' tag "${libraryid}_L${lane}" publishDir "${params.outdir}/fastqc/input_fastq", mode: params.publish_dir_mode, saveAs: { filename -> @@ -1079,7 +1092,7 @@ process fastp { """ } else { """ - fastp --in1 ${r1} --in2 ${r2} --out1 "${r1.baseName}.pG.fq.gz" --out2 "${r2.baseName}.pG.fq.gz" -A -g --poly_g_min_len "${params.complexity_filter_poly_g_min}" -Q -L -w ${task.cpus} --json "${libraryid}"_L${lane}_fastp.json + fastp --in1 ${r1} --in2 ${r2} --out1 "${r1.baseName}.pG.fq.gz" --out2 "${r2.baseName}.pG.fq.gz" -A -g --poly_g_min_len "${params.complexity_filter_poly_g_min}" -Q -L -w ${task.cpus} --json "${libraryid}"_L${lane}_polyg_fastp.json """ } } @@ -1422,7 +1435,7 @@ process lanemerge_hostremoval_fastq { // Post-preprocessing QC to help user check pre-processing removed all sequencing artefacts process fastqc_after_clipping { - label 'sc_small' + label 'mc_small' tag "${libraryid}_L${lane}" publishDir "${params.outdir}/fastqc/after_clipping", mode: params.publish_dir_mode, saveAs: { 
filename -> @@ -1541,6 +1554,7 @@ process circulargenerator{ else null } + input: file fasta from ch_fasta_for_circulargenerator @@ -1578,7 +1592,7 @@ process circularmapper{ params.mapper == 'circularmapper' script: - def filter = params.circularfilter ? '' : '-f true -x false' + def filter = params.circularfilter ? '-f true -x true' : '' def elongated_root = "${fasta.baseName}_${params.circularextension}.fasta" def size = params.large_ref ? '-c' : '' @@ -1853,7 +1867,7 @@ process samtools_filter { output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*filtered.bam"), file("*.{bai,csi}") into ch_output_from_filtering - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.fastq.gz") optional true into ch_bam_filtering_for_metagenomic + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.fastq.gz") optional true into ch_bam_filtering_for_metagenomic,ch_metagenomic_for_skipentropyfilter tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.bam") optional true // Using shell block rather than script because we are playing with awk @@ -2172,11 +2186,11 @@ process library_merge { if (!params.skip_deduplication) { ch_input_for_skiplibrarymerging.mix(ch_output_from_librarymerging) .filter { it =~/.*_rmdup.bam/ } - .into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools } + .into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools; ch_rmdup_for_damagerescaling } } else { ch_input_for_skiplibrarymerging.mix(ch_output_from_librarymerging) - .into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools } + .into { ch_rmdup_for_skipdamagemanipulation; ch_rmdup_for_pmdtools; ch_rmdup_for_bamutils; ch_rmdup_for_bedtools; ch_rmdup_for_damagerescaling } } 
////////////////////////////////////////////////// @@ -2282,7 +2296,37 @@ process damageprofiler { """ } -// Optionally perform further aDNA evaluation or filtering for just reads with damage etc. +// Damage rescaling with mapDamage + +process mapdamage_rescaling { + + label 'sc_small' + tag "${libraryid}" + + publishDir "${params.outdir}/damage_rescaling", mode: params.publish_dir_mode + + when: + params.run_mapdamage_rescaling + + input: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_rmdup_for_damagerescaling + file fasta from ch_fasta_for_damagerescaling.collect() + + output: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_rescaled.bam"), path("*rescaled.bam.{bai,csi}") into ch_output_from_damagerescaling + + script: + def base = "${bam.baseName}" + def singlestranded = strandedness == "single" ? '--single-stranded' : '' + def size = params.large_ref ? '-c' : '' + """ + mapDamage -i ${bam} -r ${fasta} --rescale --rescale-out ${bam}_rescaled.bam --rescale-length-5p ${params.rescale_length_5p} --rescale-length-3p=${params.rescale_length_3p} ${singlestranded} + samtools index ${bam}_rescaled.bam ${size} + """ + +} + +// Optionally perform further aDNA evaluation or filtering for just reads with damage etc. process pmdtools { label 'mc_small' @@ -2369,7 +2413,7 @@ process bam_trim { """ } -// Post trimming merging of libraries to single samples, except for SS/DS +// Post-trimming merging of libraries to single samples, except for SS/DS // libraries as they should be genotyped separately, because we will assume // that if trimming is turned on, 'lab-removed' libraries can be combined with // merged with 'in-silico damage removed' libraries to improve genotyping @@ -2443,7 +2487,7 @@ process qualimap { script: def snpcap = params.snpcapture_bed != '' ? "-gff ${params.snpcapture_bed}" : '' """ - qualimap bamqc -bam $bam -nt ${task.cpus} -outdir . 
-outformat "HTML" ${snpcap} + qualimap bamqc -bam $bam -nt ${task.cpus} -outdir . -outformat "HTML" ${snpcap} --java-mem-size=${task.memory.toGiga()}G """ } @@ -2464,12 +2508,19 @@ if ( params.run_genotyping && params.genotyping_source == 'raw' ) { .into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd } } else if ( params.run_genotyping && params.genotyping_source == "pmd" && !params.run_pmdtools ) { - exit 1, "[nf-core/eager] error: Cannot run genotyping with 'pmd' source without running pmtools (--run_pmdtools)! Please check input parameters." + exit 1, "[nf-core/eager] error: Cannot run genotyping with 'pmd' source without running pmdtools (--run_pmdtools)! Please check input parameters." } else if ( params.run_genotyping && params.genotyping_source == "pmd" && params.run_pmdtools ) { ch_output_from_pmdtools .into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd } +} else if ( params.run_genotyping && params.genotyping_source == "rescaled" && params.run_mapdamage_rescaling) { + ch_output_from_damagerescaling + .into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd } + +} else if ( params.run_genotyping && params.genotyping_source == "rescaled" && !params.run_mapdamage_rescaling) { + exit 1, "[nf-core/eager] error: Cannot run genotyping with 'rescaled' source without running damage rescaling (--run_mapdamage_rescaling)!
Please check input parameters." + } else if ( !params.run_genotyping && !params.run_trim_bam && !params.run_pmdtools ) { ch_rmdup_for_skipdamagemanipulation .into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes; ch_damagemanipulation_for_genotyping_pileupcaller; ch_damagemanipulation_for_genotyping_angsd } @@ -2640,36 +2691,36 @@ process genotyping_pileupcaller { """ samtools mpileup -B -q 30 -Q 30 ${use_bed} -f ${fasta} ${bam_list} | pileupCaller ${caller} ${ssmode} ${transitions_mode} --sampleNames ${sample_names} ${use_snp} -e pileupcaller.${strandedness} """ - } - +} + process eigenstrat_snp_coverage { - label 'mc_tiny' - tag "${strandedness}" - publishDir "${params.outdir}/genotyping", mode: params.publish_dir_mode - - when: - params.run_genotyping && params.genotyping_tool == 'pileupcaller' - - input: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump() - - output: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc - path("*_eigenstrat_coverage.txt") - - script: - /* - The following code block can be swapped in once the eigenstratdatabasetools MultiQC module becomes available. 
- """ - eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json - """ - */ - """ - eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt - parse_snp_cov.py ${strandedness}_eigenstrat_coverage.txt - """ - } - + label 'mc_tiny' + tag "${strandedness}" + publishDir "${params.outdir}/genotyping", mode: params.publish_dir_mode + + when: + params.run_genotyping && params.genotyping_tool == 'pileupcaller' + + input: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump(tag:'eigenstrat_input') + + output: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc + path("*_eigenstrat_coverage.txt") + + script: + /* + The following code block can be swapped in once the eigenstratdatabasetools MultiQC module becomes available. 
+ """ + eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json + """ + */ + """ + eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt + parse_snp_cov.py ${strandedness}_eigenstrat_coverage.txt + """ +} + process genotyping_angsd { label 'mc_small' tag "${samplename}" @@ -2897,22 +2948,57 @@ process print_nuclear_contamination{ /* -- METAGENOMICS-SPECIFIC ADDITIONAL STEPS -- */ ///////////////////////////////////////////////////////// +// Low entropy read filter to reduce input sequences of reads that are highly uninformative, and thus reduce runtime/false positives + +process metagenomic_complexity_filter { + label 'mc_small' + tag "${samplename}" + publishDir "${params.outdir}/metagenomic_complexity_filter/", mode: params.publish_dir_mode + + when: + params.metagenomic_complexity_filter + + input: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(fastq) from ch_bam_filtering_for_metagenomic + + + output: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_lowcomplexityremoved.fq.gz") into ch_lowcomplexityfiltered_for_metagenomic + path("*_bbduk.stats") into ch_metagenomic_complexity_filter_for_multiqc + + script: + """ + bbduk.sh -Xmx${task.memory.toGiga()}g in=${fastq} threads=${task.cpus} entropymask=f entropy=${params.metagenomic_complexity_entropy} out=${fastq}_lowcomplexityremoved.fq.gz 2> ${fastq}_bbduk.stats + """ + +} + +// metagenomic complexity filter bypass + +if ( params.metagenomic_complexity_filter ) { + ch_lowcomplexityfiltered_for_metagenomic + .set{ ch_filtered_for_metagenomic } +} else { + ch_metagenomic_for_skipentropyfilter + .set{ ch_filtered_for_metagenomic } +} + // MALT is a super-fast BLAST replacement typically used for pathogen detection or microbiome profiling against large databases, here using off-target reads from mapping 
// As we collect all files for a all metagenomic runs, we DO NOT use the normal input/output tuple! if (params.metagenomic_tool == 'malt') { - ch_bam_filtering_for_metagenomic - .set {ch_bam_filtering_for_metagenomic_malt} + ch_filtered_for_metagenomic + .set {ch_input_for_metagenomic_malt} - ch_bam_filtering_for_metagenomic_kraken = Channel.empty() + ch_input_for_metagenomic_kraken = Channel.empty() } else if (params.metagenomic_tool == 'kraken') { - ch_bam_filtering_for_metagenomic - .set {ch_bam_filtering_for_metagenomic_kraken} + ch_filtered_for_metagenomic + .set {ch_input_for_metagenomic_kraken} - ch_bam_filtering_for_metagenomic_malt = Channel.empty() + ch_input_for_metagenomic_malt = Channel.empty() } else if ( params.metagenomic_tool == '' ) { - ch_bam_filtering_for_metagenomic_malt = Channel.empty() - ch_bam_filtering_for_metagenomic_kraken = Channel.empty() + ch_input_for_metagenomic_malt = Channel.empty() + ch_input_for_metagenomic_kraken = Channel.empty() } @@ -2925,7 +3011,7 @@ process malt { params.run_metagenomic_screening && params.run_bam_filtering && params.bam_unmapped_type == 'fastq' && params.metagenomic_tool == 'malt' input: - file fastqs from ch_bam_filtering_for_metagenomic_malt.map { it[7] }.collect() + file fastqs from ch_input_for_metagenomic_malt.map { it[7] }.collect() file db from ch_db_for_malt output: @@ -3043,7 +3129,7 @@ process kraken { params.run_metagenomic_screening && params.run_bam_filtering && params.bam_unmapped_type == 'fastq' && params.metagenomic_tool == 'kraken' input: - path(fastq) from ch_bam_filtering_for_metagenomic_kraken.map { it[7] } + path(fastq) from ch_input_for_metagenomic_kraken.map { it[7] } path(krakendb) from ch_krakendb output: @@ -3165,6 +3251,8 @@ process get_software_versions { pileupCaller --version &> v_sequencetools.txt 2>&1 || true bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true eigenstrat_snp_coverage --version | cut -d ' ' -f2 >v_eigenstrat_snp_coverage.txt || true + 
mapDamage2 --version > v_mapdamage.txt || true + bbduk.sh | grep 'Last modified' | cut -d' ' -f 3-99 > v_bbduk.txt || true scrape_software_versions.py &> software_versions_mqc.yaml """ @@ -3198,6 +3286,7 @@ process multiqc { file ('mutnucratio/*') from ch_mtnucratio_for_multiqc.collect().ifEmpty([]) file ('endorspy/*') from ch_endorspy_for_multiqc.collect().ifEmpty([]) file ('multivcfanalyzer/*') from ch_multivcfanalyzer_for_multiqc.collect().ifEmpty([]) + file ('fastp_lowcomplexityfilter/*') from ch_metagenomic_complexity_filter_for_multiqc.collect().ifEmpty([]) file ('malt/*') from ch_malt_for_multiqc.collect().ifEmpty([]) file ('kraken/*') from ch_kraken_for_multiqc.collect().ifEmpty([]) file ('hops/*') from ch_hops_for_multiqc.collect().ifEmpty([]) @@ -3391,7 +3480,7 @@ def checkHostname() { def extract_data(tsvFile) { Channel.fromPath(tsvFile) .splitCsv(header: true, sep: '\t') - .dump() + .dump(tag:'tsv_extract') .map { row -> def expected_keys = ['Sample_Name', 'Library_ID', 'Lane', 'Colour_Chemistry', 'SeqType', 'Organism', 'Strandedness', 'UDG_Treatment', 'R1', 'R2', 'BAM'] diff --git a/nextflow.config b/nextflow.config index c9fcecfe9..8be4e8ad7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -110,6 +110,10 @@ params { pmdtools_reference_mask = '' pmdtools_max_reads = 10000 + // mapDamage + run_mapdamage_rescaling = false + params.rescale_length_5p = 12 + params.rescale_length_3p = 12 //Bedtools settings run_bedtools_coverage = false @@ -185,8 +189,12 @@ params { run_nuclear_contamination = false contamination_chrom_name = 'X' // Default to using hs37d5 name - // taxonomic classifer + // taxonomic classifier run_metagenomic_screening = false + + metagenomic_complexity_filter = false + metagenomic_complexity_entropy = 0.3 + metagenomic_tool = '' database = '' metagenomic_min_support_reads = 1 @@ -243,7 +251,7 @@ params { // Container slug. Stable releases should specify release tag! 
// Developmental code should specify :dev -process.container = 'nfcore/eager:2.2.2' +process.container = 'nfcore/eager:2.3' // Load base.config by default for all pipelines includeConfig 'conf/base.config' @@ -337,7 +345,7 @@ manifest { description = 'A fully reproducible and state-of-the-art ancient DNA analysis pipeline' mainScript = 'main.nf' nextflowVersion = '!>=20.04.0' - version = '2.2.2' + version = '2.3' } // Function to ensure that resource requirements don't go beyond diff --git a/nextflow_schema.json b/nextflow_schema.json index 6c14dc9c9..bc51825a0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -555,9 +555,9 @@ }, "circularfilter": { "type": "boolean", - "description": "Turn on to filter off-target reads (circularmapper only).", + "description": "Turn on to remove reads that did not map to the circularised genome (circularmapper only).", "fa_icon": "fas fa-filter", - "help_text": "If you want to filter out reads that don't map to a circular chromosome, turn this on. By default this option is turned off.\n" + "help_text": "If you want to filter out reads that don't map to a circular chromosome (and also non-circular chromosome headers) from the resulting BAM file, turn this on. By default this option is turned off.\n> Modifies -f and -x parameters of CircularMapper's realignsamfile\n" }, "bt2_alignmode": { "type": "string", @@ -790,6 +790,26 @@ "description": "Specify the maximum number of reads to consider for metrics generation.", "fa_icon": "fas fa-greater-than-equal", "help_text": "The maximum number of reads used for damage assessment in PMDtools. Can be used to significantly reduce the amount of time required for damage assessment in PMDTools. 
Note that a too low value can also obtain incorrect results.\n\n> Modifies PMDTools parameter: `-n`" + }, + "run_mapdamage_rescaling": { + "type": "boolean", + "fa_icon": "fas fa-map", + "description": "Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage.", + "help_text": "Turns on mapDamage2's BAM rescaling functionality. This probabilistically replaces Ts back to Cs depending on the likelihood this reference-mismatch was originally caused by damage. If the library is specified to be single stranded, this will automatically use the `--single-stranded` mode.\n\nThis functionality does not have any MultiQC output.\n\n:warning: rescaled libraries will not be merged with non-scaled libraries of the same sample for downstream genotyping, as the model may be different for each library. If you wish to merge these, please do this manually and re-run nf-core/eager using the merged BAMs as input. \n\n> Modifies the `--rescale` parameter of mapDamage2" + }, + "rescale_length_5p": { + "type": "integer", + "default": 12, + "fa_icon": "fas fa-balance-scale-right", + "description": "Length of read for mapDamage2 to rescale from 5p end.", + "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-5p` parameter of mapDamage2." + }, + "rescale_length_3p": { + "type": "integer", + "default": 12, + "fa_icon": "fas fa-balance-scale-left", + "description": "Length of read for mapDamage2 to rescale from 3p end.", + "help_text": "Specify the length from the end of the read that mapDamage should rescale.\n\n> Modifies the `--rescale-length-3p` parameter of mapDamage2." } }, "fa_icon": "fas fa-chart-line", @@ -895,9 +915,15 @@ "genotyping_source": { "type": "string", "default": "raw", - "description": "Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed' or 'pmd'.", + "description": "Specify which input BAM to use for genotyping.
Options: 'raw', 'trimmed', 'pmd' or 'rescaled'.", "fa_icon": "fas fa-faucet", - "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: `'raw'` for mapped only, filtered, or DeDup BAMs (with priority right to left); `'trimmed'` (for base clipped BAMs); `'pmd'` (for pmdtools output). Default is: `'raw'`.\n" + "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: `'raw'` for mapped only, filtered, or DeDup BAMs (with priority right to left); `'trimmed'` (for base clipped BAMs); `'pmd'` (for pmdtools output); `'rescaled'` (for mapDamage2 rescaling output). Default is: `'raw'`.\n", + "enum": [ + "raw", + "pmd", + "trimmed", + "rescaled" + ] }, "gatk_call_conf": { "type": "integer", @@ -1291,6 +1317,21 @@ "description": "Options for metagenomic screening of off-target reads.", "default": "", "properties": { + "metagenomic_complexity_filter": { + "type": "boolean", + "description": "Turn on removal of low-sequence complexity reads for metagenomic screening with bbduk", + "help_text": "Turns on low-sequence complexity filtering of off-target reads using `bbduk`.\n\nThis is typically performed to reduce the number of uninformative reads or potential false-positive reads, typically for input for metagenomic screening. This thus reduces false positive species IDs and also run-time and resource requirements.\n\nSee `--metagenomic_complexity_entropy` for how complexity is calculated. **Important** There are no MultiQC output results for this module, you must check the number of reads removed with the `_bbduk.stats` output file.\n\nDefault: off\n", + "fa_icon": "fas fa-filter" + }, + "metagenomic_complexity_entropy": { + "type": "number", + "default": 0.3, + "description": "Specify the entropy threshold that under which a sequencing read will be complexity filtered out. 
This should be between 0-1.", + "minimum": 0, + "maximum": 1, + "help_text": "Specify a minimum entropy threshold below which a read will be _removed_ from the FASTQ file that goes into metagenomic screening. \n\nA mono-nucleotide read such as GGGGGG will have an entropy of 0, a completely random sequence has an entropy of almost 1.\n\nSee the `bbduk` [documentation](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/-filter) on entropy for more information.\n\n> Modifies `bbduk` parameter `entropy=`", + "fa_icon": "fas fa-percent" + }, "run_metagenomic_screening": { "type": "boolean", "description": "Turn on metagenomic screening module for reference-unmapped reads.",