diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 976a7a107..5897abf87 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -98,6 +98,9 @@ jobs: - name: MAPPER_BWAMEM Test running with BWA Mem run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --mapper 'bwamem' + - name: MAPPER_BT2 Test running with BowTie2 + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --mapper 'bowtie2' --bt2_alignmode 'local' --bt2_sensitivity 'sensitive' --bt2n 1 --bt2l 16 --bt2_trim5 1 --bt2_trim3 1 - name: STRIP_FASTQ Run the basic pipeline with output unmapped reads as fastq run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --strip_input_fastq diff --git a/.gitignore b/.gitignore index fb7080838..822f13d6e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ tests/ testing/ *.pyc main_playground.nf +.vscode +*.code-workspace diff --git a/CHANGELOG.md b/CHANGELOG.md index cda7551e8..9e25d40f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * Added ability for automated emails using `mailutils` to also send MultiQC reports * General documentation additions and cleaning, updated figures with CC-BY license * Added large 'fullsize' dataset test-profiles for ancient fish, human, and a draft pathogen contexts. +* [#257](https://github.com/nf-core/eager/issues/257) Added the bowtie2 aligner as option for mapping, following Poullet and Orlando 2020 doi: [10.3389/fevo.2020.00105](https://doi.org/10.3389/fevo.2020.00105) ### `Fixed` diff --git a/README.md b/README.md index b8fd5308a..61e226589 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ By default the pipeline currently performs the following: * Create reference genome indices for mapping (`bwa`, `samtools`, and `picard`) * Sequencing quality control (`FastQC`) * Sequencing adapter removal and for paired end data merging (`AdapterRemoval`) -* Read mapping to reference using (`bwa aln`, `bwa mem` or `CircularMapper`) +* Read mapping to reference using (`bwa aln`, `bwa mem`, `CircularMapper`, or `bowtie2`) * Post-mapping processing, statistics and conversion to bam (`samtools`) * Ancient DNA C-to-T damage pattern visualisation (`DamageProfiler`) * PCR duplicate removal (`DeDup` or `MarkDuplicates`) @@ -66,7 +66,7 @@ Additional functionality contained by the pipeline currently includes: #### Biological Information * Mitochondrial to Nuclear read ratio calculation (`MtNucRatioCalculator`) -* Statistical sex determination of human individuals (`SexDetErrmine`) +* Statistical sex determination of human individuals (`Sex.DetERRmine`) #### Metagenomic Screening @@ -178,8 +178,10 @@ If you've contributed and you're missing in here, please let us know and we will * Vågene, Å.J. et al., 2018. Salmonella enterica genomes from victims of a major sixteenth-century epidemic in Mexico. Nature ecology & evolution, 2(3), pp.520–528. Available at: [http://dx.doi.org/10.1038/s41559-017-0446-6](http://dx.doi.org/10.1038/s41559-017-0446-6). * Herbig, A. et al., 2016. MALT: Fast alignment and analysis of metagenomic DNA sequence data applied to the Tyrolean Iceman. bioRxiv, p.050559. Available at: [http://biorxiv.org/content/early/2016/04/27/050559](http://biorxiv.org/content/early/2016/04/27/050559). * **MaltExtract** Huebler, R. et al., 2019. HOPS: Automated detection and authentication of pathogen DNA in archaeological remains. bioRxiv, p.534198. Available at: [https://www.biorxiv.org/content/10.1101/534198v1?rss=1](https://www.biorxiv.org/content/10.1101/534198v1?rss=1). Download: [https://github.com/rhuebler/MaltExtract](https://github.com/rhuebler/MaltExtract) -* **Kraken2** Wood, D et al., 2019. Improved metagenomic analysis with Kraken 2. Genome Biology volume 20, Article number: 257. Available at: [https://doi.org/10.1186/s13059-019-1891-0](https://doi.org/10.1186/s13059-019-1891-0). Download: [https://ccb.jhu.edu/software/kraken2/](https://ccb.jhu.edu/software/kraken2/) +* **Kraken2** Wood, D et al., 2019. Improved metagenomic analysis with Kraken 2. Genome Biology volume 20, Article number: 257. Available at: [https://doi.org/10.1186/s13059-019-1891-0](https://doi.org/10.1186/s13059-019-1891-0). Download: [https://ccb.jhu.edu/software/kraken2/](https://ccb.jhu.edu/software/kraken2/) * **endorS.py** Aida Andrades Valtueña (Unpublished). Download: [https://github.com/aidaanva/endorS.py](https://github.com/aidaanva/endorS.py) +* **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923). +* **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools) ## Data References diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index fa63846cd..f0e3e1ece 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -9,6 +9,7 @@ report_comment: > run_modules: - adapterRemoval + - bowtie2 - custom_content - damageprofiler - dedup @@ -56,6 +57,7 @@ extra_fn_clean_exts: - '.unifiedgenotyper' - '.trimmed_stats' - '_libmerged' + - '_bt2' top_modules: @@ -68,8 +70,11 @@ top_modules: - 'fastqc': name: 'FastQC (post-AdapterRemoval)' path_filters: - - '*.truncated_fastqc.zip' - - '*.combined*_fastqc.zip' + - '*.truncated_fastqc.zip' + - '*.combined*_fastqc.zip' + - 'bowtie2': + path_filters: + - '*_bt2.log' - 'malt' - 'hops' - 'kraken' @@ -117,6 +122,8 @@ table_columns_visible: percent_duplicates: False total_sequences: True percent_gc: True + bowtie2: + overall_alignment_rate: True MALT: Taxonomic assignment success: False Assig. Taxonomy: False @@ -167,6 +174,8 @@ table_columns_placement: total_sequences: 400 avg_sequence_length: 410 percent_gc: 420 + Bowtie 2 / HiSAT2: + overall_alignment_rate: 450 MALT: Num. of queries: 430 Total reads: 440 @@ -181,14 +190,14 @@ table_columns_placement: Samtools Flagstat (post-samtools filter): flagstat_total: 553 mapped_passed: 554 + custom_content: + endogenous_dna: 600 + endogenous_dna_post: 610 DeDup: - mapped_after_dedup: 600 - clusterfactor: 610 + mapped_after_dedup: 620 + clusterfactor: 630 Picard: PERCENT_DUPLICATION: 650 - custom_content: - endogenous_dna: 680 - endogenous_dna_post: 690 DamageProfiler: 5 Prime1: 700 5 Prime2: 710 diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 0ba7d4c39..1e65e59d6 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -8,11 +8,12 @@ 'Nextflow': ['v_nextflow.txt', r"(\S+)"], 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], 'AdapterRemoval':['v_adapterremoval.txt', r"AdapterRemoval ver. (\S+)"], - 'Picard MarkDuplicates': ['v_markduplicates.txt', r"([\d\.]+)-SNAPSHOT"], + 'Picard MarkDuplicates': ['v_markduplicates.txt', r"(\S+)"], 'Samtools': ['v_samtools.txt', r"samtools (\S+)"], 'Preseq': ['v_preseq.txt', r"Version: (\S+)"], 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], - 'BWA': ['v_bwa.txt', r"Version: (\S+)"], + 'BWA': ['v_bwa.txt', r"Version: (\S+)"], + 'Bowtie2': ['v_bowtie2.txt', r"bowtie2-([0-9]+\.[0-9]+\.[0-9]+) -fdebug"], 'Qualimap': ['v_qualimap.txt', r"QualiMap v.(\S+)"], 'GATK HaplotypeCaller': ['v_gatk.txt', r" v(\S+)"], #'GATK UnifiedGenotyper': ['v_gatk3_5.txt', r"version (\S+)"], @@ -24,7 +25,7 @@ 'circulargenerator':['v_circulargenerator.txt',r"CircularGeneratorv(\S+)"], 'DeDup':['v_dedup.txt',r"DeDup v(\S+)"], 'freebayes':['v_freebayes.txt',r"v([0-9]\S+)"], - 'Sequence Tools':['v_sequencetools.txt',r"v([0-9]\S+)"], + 'sequenceTools':['v_sequencetools.txt',r"(\S+)"], 'maltextract':['v_maltextract.txt', r"version(\S+)"], 'malt':['v_malt.txt',r"version (\S+)"], 'multivcfanalyzer':['v_multivcfanalyzer.txt', r"MultiVCFAnalyzer - (\S+)"], @@ -43,6 +44,7 @@ results['AdapterRemoval'] = 'N/A' results['fastP'] = 'N/A' results['BWA'] = 'N/A' +results['Bowtie2'] = 'N/A' results['circulargenerator'] = 'N/A' results['Samtools'] = 'N/A' results['endorS.py'] = 'N/A' diff --git a/docs/output.md b/docs/output.md index cff099d1c..fb7f5006f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -2,19 +2,23 @@ ## Table of contents -* [Introduction](#introduction) -* [Directory Structure](#directory-structure) -* [MultiQC Report](#multiqc-report) - * [General Stats Table](#general-stats-table) - * [FastQC](#fastqc) - * [FastP](#fastp) - * [AdapterRemoval](#adapterremoval) - * [Samtools](#samtools) - * [DeDup](#dedup) - * [Preseq](#preseq) - * [QualiMap](#qualimap) - * [DamageProfiler](#damageprofiler) -* [Output Files](#output-files) +- [nf-core/eager: Output](#nf-coreeager-output) + - [Table of contents](#table-of-contents) + - [Introduction](#introduction) + - [Directory Structure](#directory-structure) + - [Primary Output Directories](#primary-output-directories) + - [Secondary Output Directories](#secondary-output-directories) + - [MultiQC Report](#multiqc-report) + - [General Stats Table](#general-stats-table) + - [FastQC](#fastqc) + - [FastP](#fastp) + - [AdapterRemoval](#adapterremoval) + - [Samtools](#samtools) + - [DeDup](#dedup) + - [Preseq](#preseq) + - [DamageProfiler](#damageprofiler) + - [QualiMap](#qualimap) + - [Output Files](#output-files) ## Introduction @@ -35,29 +39,29 @@ results/ work/ ``` -* The parent directory `` directory contains the (cleaned-up) output from a particular software module. This is the second most important set of directories. This contains output files such as FASTQ, BAM, statistics, and/or plot files of a specific module (see the [Output Files](#output-files) section for more detail). The latter two are only needed when you need finer detail about that particular module. +- The `MultiQC` directory is the most important directory and contains the main summary report of the run in HTML format, which can be viewed in a web-browser of your choice. The sub-diectory contains the MultiQC collected data used to build the HTML report. The Report allows you to get an overview of the sequencing and mapping quality as well as aDNA metrics (see the [MultiQC Report](#multiqc-report) section for more detail). +- A `` directory contains the (cleaned-up) output from a particular software module. This is the second most important set of directories. This contains output files such as FASTQ, BAM, statistics, and/or plot files of a specific module (see the [Output Files](#output-files) section for more detail). The latter two are only needed when you need finer detail about that particular module. ### Secondary Output Directories These are less important directories which are used less often, normally in the context of bug-reporting. -* `pipeline_info` contains back-end reporting of the pipeline itself such as run times and computational statistics. You rarely need this information other than for curiosity or when bug-reporting. -* `reference_genome` contains either text files describing the location of specified reference genomes, and if not already supplied when running the pipeline, auxilary indexing files. This is often useful when re-running other samples using the same reference genome, but is otherwise often not otherwise important. +- `pipeline_info` contains back-end reporting of the pipeline itself such as run times and computational statistics. You rarely need this information other than for curiosity or when bug-reporting. +- `reference_genome` contains either text files describing the location of specified reference genomes, and if not already supplied when running the pipeline, auxilary indexing files. This is often useful when re-running other samples using the same reference genome, but is otherwise often not otherwise important. -* The `work` directory contains all the `nextflow` processing directories. This is where `nextflow` actually does all the work, but in an efficient programmatic procedure that is not intuitive to human-readers. Due to this, the directory is often not important to a user as all the useful output files are linked to the module directories (see above). Otherwise, this directory maybe useful when a bug-reporting. +- The `work` directory contains all the `nextflow` processing directories. This is where `nextflow` actually does all the work, but in an efficient programmatic procedure that is not intuitive to human-readers. Due to this, the directory is often not important to a user as all the useful output files are linked to the module directories (see above). Otherwise, this directory maybe useful when a bug-reporting. > :warning: Note that `work/` will be created wherever you are running the `nextflow run` command from, unless you specify the location with `-w`, i.e. it will not by default be in `outdir`!. ## MultiQC Report -In this section we will run through the output of each module as reported in a MultiQC output. This can be viewed by opening the HTML file in your `/MultiQC/` directory in a web browser. The section will also provide some basic tips on how to interpret the plots and values, although we highly recommend reading the READMEs or original papers of the tools used in the pipeline. A list of references can be seen on the [EAGER2 github repository](https://github.com/nf-core/eager/) +In this section we will run through the output of each **default** module as reported in a MultiQC output. This can be viewed by opening the HTML file in your `/MultiQC/` directory in a web browser. The section will also provide some basic tips on how to interpret the plots and values, although we highly recommend reading the READMEs or original papers of the tools used in the pipeline. A list of references can be seen on the [EAGER2 github repository](https://github.com/nf-core/eager/) For more information about how to use MultiQC reports, see [http://multiqc.info](http://multiqc.info) @@ -69,32 +73,34 @@ This is the main summary table produced by MultiQC that the report begins with. #### Table -This table will report values per-file (or rather per module log-file). It will try to collapse as far as possible, if the log files have the same name - however particularly for pre-AdapterRemoval FastQC, you will likely get duplicate lines for the same sample - i.e. R1 and R2 for each library. +This table will report values per-file (or rather per module log-file). It will try to collapse as far as possible, if the log files have the same name - however particularly for pre-AdapterRemoval FastQC, you will likely get duplicate lines for the same sample - i.e. R1 and R2 for each library. Accordingly, separate libraries will be displayed separately e.g. for DamageProfiler if using TSV input and perform merging of Samples (which would be reported at the qualimap level). Each column name is supplied by the module, so you may see similar column names. When unsure, hovering over the column name will allow you see which module it is derived from. -The default columns are as follows: - -* **Sample Name** This is the log file name without the file suffix. This will depend on the module outputs. -* **Seqs** This is from Pre-AdapterRemoval FastQC. Represents the number of raw reads in your untrimmed and (paired end) unmerged FASTQ file. Each row should be approximately equal to the number of reads you requested to be sequenced, divided by the number of FASTQ files you received for that library. -* **Length** This is from Pre-AdapterRemoval FastQC. This is the average read length in your untrimmed and (paired end) unmerged FASTQ file and should represent the number of cycles of your sequencing chemistry. -* **%GC** This is from Pre-AdapterRemoval FastQC. This is the average GC content in percent of all the reads in your untrimmed and (paired end) unmerged FASTQ file. -* **GC content** This is from FastP. This is the average GC of all reads in your untrimmed and unmerged FASTSQ file after poly-G tail trimming. If you have lots of tails, this value should drop from the pre-AadapterRemoval FastQC %GC column. -* **% Trimmed** This is from AdapterRemoval. It is the percentage of reads which had an adapter sequence removed from the end of the read. -* **Seqs** This is from Post-AdapterRemoval FastQC. Represents the number of preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. The loss between this number and the Pre-AdapterRemoval FastQC can give you an idea of the quality of trimming and merging. -* **%GC** This is from Post-AdapterRemoval FastQC. Represents the average GC of all preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. -* **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering and deduplication. -* **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering and deduplication (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second) -* **Endogenous DNA (%)** This is from the endorS.py tool. It displays a percentage of mapped reads over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). Assuming a perfect ancient sample with no modern contamination, this would be the the amount of true ancient DNA in the sample. However this value _most likely_ include contamination and will not entirely be the true 'endogenous' content. -* **Endogenous DNA Post (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (e.g. for mapping quality) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM. -* **Duplication Rate** This is from DeDup. This is the percentage of overall number of mapped reads that were an exact duplicate of another read. The number of reads removed by DeDup can be calculating this number by mapped reads (if no map quality filtering was applied!) -* **Coverage** This is from Qualimap. This is the median number of times a base on your reference genome was covered by a read (i.e. depth coverage).. This average includes bases with 0 reads covering that position. -* **>= 1X** to **>= 5X** These are from Qualimap. This is the percentage of the genome covered at that particular depth coverage. -* **% GC** This is the mean GC content in percent of all mapped reads post-deduplication. This should normally be close to the GC content of your reference genome. -* **% MTNUC** This is actually a ratio of the number of mitochondrial reads to the number of reads hitting autosomal chromosomes. -* **X Prime Y>Z N base** These columns are from DamageProfiler. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half- UDG treatment a decrease in frequency from the 1st to 2nd base. -* **Mean Read Length** This is from DamageProfiler. This is the mean length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. -* **Median Read Length** This is from DamageProfiler. This is the median length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. +The **default** columns are as follows: + +- **Sample Name** This is the log file name without file suffix(s). This will depend on the module outputs. +- **Seqs** This is from Pre-AdapterRemoval FastQC. Represents the number of raw reads in your untrimmed and (paired end) unmerged FASTQ file. Each row should be approximately equal to the number of reads you requested to be sequenced, divided by the number of FASTQ files you received for that library. +- **Length** This is from Pre-AdapterRemoval FastQC. This is the average read length in your untrimmed and (paired end) unmerged FASTQ file and should represent the number of cycles of your sequencing chemistry. +- **%GC** This is from Pre-AdapterRemoval FastQC. This is the average GC content in percent of all the reads in your untrimmed and (paired end) unmerged FASTQ file. +- **GC content** This is from FastP. This is the average GC of all reads in your untrimmed and unmerged FASTSQ file after poly-G tail trimming. If you have lots of tails, this value should drop from the pre-AadapterRemoval FastQC %GC column. +- **% Trimmed** This is from AdapterRemoval. It is the percentage of reads which had an adapter sequence removed from the end of the read. +- **Seqs** This is from Post-AdapterRemoval FastQC. Represents the number of preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. The loss between this number and the Pre-AdapterRemoval FastQC can give you an idea of the quality of trimming and merging. +- **%GC** This is from Post-AdapterRemoval FastQC. Represents the average GC of all preprocessed reads in your adapter trimmed (paired end) merged FASTQ file. +- **Length** This is from post-AdapterRemoval FastQC. This is the average read length in your trimmed and (paired end) merged FASTQ file and should represent the 'realistic' average lengths of your DNA molecules +- **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _prior_ map quality filtering and deduplication. +- **Reads Mapped** This is from Samtools. This is the raw number of preprocessed reads mapped to your reference genome _after_ map quality filtering and deduplication (note the column name does not distinguish itself from prior-map quality filtering, but the post-filter column is always second) +- **Endogenous DNA (%)** This is from the endorS.py tool. It displays a percentage of mapped reads over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). Assuming a perfect ancient sample with no modern contamination, this would be the amount of true ancient DNA in the sample. However this value _most likely_ include contamination and will not entirely be the true 'endogenous' content. +- **Endogenous DNA Post (%)** This is from the endorS.py tool. It displays a percentage of mapped reads _after_ BAM filtering (e.g. for mapping quality) over total reads that went into mapped (i.e. the percentage DNA content of the library that matches the reference). This column will only be displayed if BAM filtering is turned on and is based on the original mapping for total reads, and mapped reads as calculated from the post-filtering BAM. +- **ClusterFactor** This is from DeDup. This is a value representing the how many duplicates in the library exist for each unique read. A cluster factor close to one replicates a highly complex library and could be sequenced further. Generally with a value of more than 2 you will not be gaining much more information by sequencing deeper. +- **X Prime Y>Z N base** These columns are from DamageProfiler. The prime numbers represent which end of the reads the damage is referring to. The Y>Z is the type of substitution (C>T is the true damage, G>A is the complementary). You should see for no- and half- UDG treatment a decrease in frequency from the 1st to 2nd base. +- **Mean Read Length** This is from DamageProfiler. This is the mean length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. +- **Median Read Length** This is from DamageProfiler. This is the median length of all de-duplicated mapped reads. Ancient DNA normally will have a mean between 30-75, however this can vary. +- **Coverage** This is from Qualimap. This is the median number of times a base on your reference genome was covered by a read (i.e. depth coverage).. This average includes bases with 0 reads covering that position. +- **>= 1X** to **>= 5X** These are from Qualimap. This is the percentage of the genome covered at that particular depth coverage. +- **% GC** This is the mean GC content in percent of all mapped reads post-deduplication. This should normally be close to the GC content of your reference genome. + +For other non-default columns, hover over the column name for further descriptions. ### FastQC @@ -110,7 +116,7 @@ For further reading and documentation see the [FastQC help](http://www.bioinform #### Sequence Counts -This shows a barplot with the overall number of sequences (x axis) in your raw library after demultiplexing, **per file** (y-axis). If you have paired end data, you will have one bar for Read 1 (or forward), and a second bar for Read 2 (or reverse). Each entire bar should represent approximately what you requested from the sequencer itself - unless you have your library sequenced over multiple lanes, where it should be what you request divided by the number of lanes it was split over. +This shows a barplot with the overall number of sequences (x axis) in your raw library after demultiplexing, **per file** (y-axis). If you have paired end data, you will have one bar for Read 1 (or forward), and a second bar for Read 2 (or reverse). Each entire bar should represent approximately what you requested from the sequencer itself - unless you have your library sequenced over multiple lanes, where it should be what you request divided by the number of lanes it was split over. A section of the bar will also show an approximate estimation of the fraction of the total number of reads that are duplicates of another. This can derive from over-amplification of the library, or lots of single adapters. This can be later checked with the Deduplication check. A good library and sequencing run should have very low amounts of duplicates reads. @@ -130,9 +136,9 @@ You will often see that the first 5 or so bases have slightly lower quality than Things to watch out for: -* all positions having Phred scores less than 27 -* a sharp drop-off of quality early in the read -* for paired-end data, if either R1 or R2 is significantly lower quality across the whole read compared to the complementary read. +- all positions having Phred scores less than 27 +- a sharp drop-off of quality early in the read +- for paired-end data, if either R1 or R2 is significantly lower quality across the whole read compared to the complementary read. #### Per Sequence Quality Scores @@ -144,8 +150,8 @@ This is a further summary of the previous plot. This is a histogram of the _over Things to watch out for: -* bi-modal peaks which suggests artefacts in some of the sequencing cycles -* all peaks being in orange or red sections which suggests an overall bad sequencing run (possibly due to a faulty flow-cell). +- bi-modal peaks which suggests artefacts in some of the sequencing cycles +- all peaks being in orange or red sections which suggests an overall bad sequencing run (possibly due to a faulty flow-cell). #### Per Base Sequencing Content @@ -159,7 +165,7 @@ You expect to see whole heatmap to be a relatively equal block of colour (normal Things to watch out for: -* If you see a particular colour becoming more prominent this suggests there is an over-representation of those bases at that base-pair range across all reads (e.g. 20-24bp). This could happen if you have lots of PCR duplicates, or poly-G tails from Illumina NextSeq/NovaSeq 2-colour chemistry data (where no fluorescence can mean both G or 'no-call'). +- If you see a particular colour becoming more prominent this suggests there is an over-representation of those bases at that base-pair range across all reads (e.g. 20-24bp). This could happen if you have lots of PCR duplicates, or poly-G tails from Illumina NextSeq/NovaSeq 2-colour chemistry data (where no fluorescence can mean both G or 'no-call'). > If you see Poly-G tails, we recommend to turn on FastP poly-G trimming with EAGER. See the 'running' documentation page for details. @@ -173,7 +179,7 @@ This line graph shows the number percentage reads (y-axis) with an average perce Things to watch out for: -* If you see particularly high percent GC content peak with NextSeq/NovaSeq data, you may have lots of PCR duplicates, or poly-G tails from Illumina NextSeq/NovaSeq 2-colour chemistry data (where no fluorescence can mean both G or 'no-call'). Consider re-running EAGER2 using the poly-G trimming option from `fastp` See the 'running' documentation page for details. +- If you see particularly high percent GC content peak with NextSeq/NovaSeq data, you may have lots of PCR duplicates, or poly-G tails from Illumina NextSeq/NovaSeq 2-colour chemistry data (where no fluorescence can mean both G or 'no-call'). Consider re-running EAGER2 using the poly-G trimming option from `fastp` See the 'running' documentation page for details. #### Per Base N Content @@ -247,7 +253,7 @@ After filtering, you should see that the average GC content along the reads is n Things to look out for: -* If you see a distinct GC content increase at the end of the reads, but are not removed after filtering, check to see where along the read the increase seems to start. If it is less than 10 base pairs from the end, consider reducing the overlap parameter `--complexity_filter_poly_g_min`, which tells FastP how far in the read the Gs need to go before removing them. +- If you see a distinct GC content increase at the end of the reads, but are not removed after filtering, check to see where along the read the increase seems to start. If it is less than 10 base pairs from the end, consider reducing the overlap parameter `--complexity_filter_poly_g_min`, which tells FastP how far in the read the Gs need to go before removing them. ### AdapterRemoval @@ -255,14 +261,14 @@ Things to look out for: AdapterRemoval a tool that does the post-sequencing clean up of your sequencing reads. It performs the following functions -* 'Merges' (or 'collapses') forward and reverse reads of Paired End data -* Removes remaining library indexing adapters -* Trims low quality base tails from ends of reads -* Removes too-short reads +- 'Merges' (or 'collapses') forward and reverse reads of Paired End data +- Removes remaining library indexing adapters +- Trims low quality base tails from ends of reads +- Removes too-short reads In more detail merging is where the same read from the forward and reverse files of a single library (based on the flowcell coordinates), are compared to find a stretch of sequence that are the same. If this overlap reaches certain quality thresholds, the two reads are 'collapsed' into a single read, with the base quality scores are updated accordingly accounting for the increase quality call precision. -Adapter removal involves finding overlaps at the 5' and 3' end of reads for the artificial NGS library adapters (which connect the DNA molecule insert, and the index), and stretches that match each other are then removed from the read itself. Note, by default AdapterRemoval does _not_ remove 'internal barcodes' (between insert and the adapter), so these statistics are not considered. +Adapter removal involves finding overlaps at the 5' and 3' end of reads for the artificial NGS library adapters (which connect the DNA molecule insert, and the index), and stretches that match each other are then removed from the read itself. Note, by default AdapterRemoval does _not_ remove 'internal barcodes' (between insert and the adapter), so these statistics are not considered. Quality trimming (or 'truncating') involves looking at ends of reads for low-confidence bases (i.e. where the FASTQ Phred score is below a certain threshold). These are then removed remove the read. @@ -276,10 +282,10 @@ The most important value is the **Retained Read Pairs** which gives you the fina Other Categories: -* If paired-end, the **Singleton [mate] R1(/R2)** categories represent reads which were unable to be collapsed, possibly due to the reads being too long to overlap. -* If paired-end, **Full-length collapsed pairs** are reads which were collapsed and did not require low-quality bases at end of reads to be removed. -* If paired-end, **Truncated collapsed pairs** are paired-end that were collapsed but did required the removal of low quality bases at the end of reads. -* **Discarded [mate] R1/R2** represent reads which were a part of a pair, but one member of the pair did not reach other quality criteria and was discarded. However the other member of the pair is still retained in the output file as it still reached other quality criteria. +- If paired-end, the **Singleton [mate] R1(/R2)** categories represent reads which were unable to be collapsed, possibly due to the reads being too long to overlap. +- If paired-end, **Full-length collapsed pairs** are reads which were collapsed and did not require low-quality bases at end of reads to be removed. +- If paired-end, **Truncated collapsed pairs** are paired-end that were collapsed but did required the removal of low quality bases at the end of reads. +- **Discarded [mate] R1/R2** represent reads which were a part of a pair, but one member of the pair did not reach other quality criteria and was discarded. However the other member of the pair is still retained in the output file as it still reached other quality criteria.

@@ -293,11 +299,11 @@ If you see high numbers of discarded or truncated reads, you should check your F The length distribution plots show the number of reads at each read-length. You can change the plot to display different categories. -* All represent the overall distribution of reads. In the case of paired-end sequencing You may see a peak at the turn around from forward to reverse cycles. -* **Mate 1** and **Mate 2** represents the length of the forward and reverse read respectively prior collapsing -* **Singleton** represent those reads that had a one member of a pair discarded -* **Collapsed** and **Collapsed Truncated** represent reads that overlapped and able to merge into a single read, with the latter including base-quality trimming off ends of reads. These plots will start with a vertical rise representing where you are above the minimum-read threshold you set. -* **Discarded** here represents the number of reads that did not each the read length filter. You will likely see a vertical drop at what your threshold was set to. +- All represent the overall distribution of reads. In the case of paired-end sequencing You may see a peak at the turn around from forward to reverse cycles. +- **Mate 1** and **Mate 2** represents the length of the forward and reverse read respectively prior collapsing +- **Singleton** represent those reads that had a one member of a pair discarded +- **Collapsed** and **Collapsed Truncated** represent reads that overlapped and able to merge into a single read, with the latter including base-quality trimming off ends of reads. These plots will start with a vertical rise representing where you are above the minimum-read threshold you set. +- **Discarded** here represents the number of reads that did not each the read length filter. You will likely see a vertical drop at what your threshold was set to.

@@ -337,15 +343,15 @@ DeDup is a duplicate removal tool which searches for PCR duplicates and removes This stacked bar plot shows as a whole the total number of reads in the BAM file going into DeDup. The different sections of a given bar represents the following: -* **Not Removed** - the overall number of reads remaining after duplicate removal. These may have had a duplicate (see below). -* **Reverse Removed** - the number of reads that found to be a duplicate of another and removed that were un-collapsed reverse reads (from the earlier read merging step). -* **Forward Removed** - the number of reads that found to be a duplicate of another and removed that were an un-collapsed forward reads (from the earlier read merging step). -* **Merged Removed** - the number of reads that were found to be a duplicate and removed that were a collapsed read (from the earlier read merging step). +- **Not Removed** - the overall number of reads remaining after duplicate removal. These may have had a duplicate (see below). +- **Reverse Removed** - the number of reads that found to be a duplicate of another and removed that were un-collapsed reverse reads (from the earlier read merging step). +- **Forward Removed** - the number of reads that found to be a duplicate of another and removed that were an un-collapsed forward reads (from the earlier read merging step). +- **Merged Removed** - the number of reads that were found to be a duplicate and removed that were a collapsed read (from the earlier read merging step). Exceptions to the above: -* If you do not have paired end data, you will not have sections for 'Merged removed' or 'Reverse removed'. -* If you use the `--dedup_all_merged` flag, you will not have the 'Forward removed' or 'Reverse removed' sections. +- If you do not have paired end data, you will not have sections for 'Merged removed' or 'Reverse removed'. +- If you use the `--dedup_all_merged` flag, you will not have the 'Forward removed' or 'Reverse removed' sections.

@@ -353,8 +359,8 @@ Exceptions to the above: Things to look out for: -* The smaller the number of the duplicates removed the better. If you have a smaller number of duplicates, and wish to sequence deeper, you can use the preseq module (see below) to make an estimate on how much deeper to sequence. -* If you have a very large number of duplicates that were removed this may suggest you have an over amplified library, or a lot of left-over adapters that were able to map to your genome. +- The smaller the number of the duplicates removed the better. If you have a smaller number of duplicates, and wish to sequence deeper, you can use the preseq module (see below) to make an estimate on how much deeper to sequence. +- If you have a very large number of duplicates that were removed this may suggest you have an over amplified library, or a lot of left-over adapters that were able to map to your genome. ### Preseq @@ -378,9 +384,50 @@ The dashed line represents a 'perfect' library containing only unique molecules Plateauing can be caused by a number of reasons: -* You have simply sequenced your library to exhaustion -* You have an over-amplified library with many PCR duplicates. You should consider rebuilding the library to maximise data to cost ratio -* You have a low quality library made up of mappable sequencing artefacts that were able to pass filtering (e.g. adapters) +- You have simply sequenced your library to exhaustion +- You have an over-amplified library with many PCR duplicates. You should consider rebuilding the library to maximise data to cost ratio +- You have a low quality library made up of mappable sequencing artefacts that were able to pass filtering (e.g. adapters) + +### DamageProfiler + +#### Background + +DamageProfiler is a tool which calculates a variety of standard 'aDNA' metrics from a BAM file. The primary plots here are the misincorporation and length distribution plots. Ancient DNA undergoes depurination and hydrolysis, causing fragmentation of molecules into gradually shorter fragments, and cytosine to thymine deamination damage, that occur on the subsequent single-stranded overhangs at the ends of molecules. + +Therefore, three main characteristics of ancient DNA are: + +- Short DNA fragments +- Elevated G and As (purines) just before strand breaks +- Increased C and Ts at ends of fragments + +#### Misincorporation Plots + +The MultiQC DamageProfiler module misincorporation plots shows the percent frequency (Y axis) of C to T mismatches at 5' read ends and complementary G to A mismatches at the 3' ends. The X axis represents base pairs from the end of the molecule from the given prime end, going into the middle of the molecule i.e. 1st base of molecule, 2nd base of molecule etc until the 14th base pair. The mismatches are when compared to the base of the reference genome at that position. + +When looking at the misincorporation plots, keep the following in mind: + +- As few-base single-stranded overhangs are more likely to occur than long overhangs, we expect to see a gradual decrease in the frequency of the modifications from position 1 to the inside of the reads. +- If your library has been **partially-UDG treated**, only the first one or two bases will display the misincorporation frequency. +- If your library has been **UDG treated** you will expect to see extremely-low to no misincorporations at read ends. +- If your library is **single-stranded**, you will expect to see only C to T misincorporations at both 5' and 3' ends of the fragments. +- We generally expect that the older the sample, or the less-ideal preservational environtment (hot/wet) the greater the frequency of C to T/G to A. +- The curve will be not smooth then you have few reads informing the frequency calculation. Read counts of less than 500 are likely not reliable. + +

+ +

+ +> **NB:** An important difference to note compared to the MapDamage tool, which DamageProfiler is an exact-reimplementation of, is that the percent frequency on the Y axis is not fixed between 0 and 0.3, and will 'zoom' into small values the less damage there is + +#### Length Distribution + +The MultiQC DamageProfiler module length distribution plots show the frequency of read lengths across forward and reverse reads respectively. + +When looking at the length distribution plots, keep in mind the following: + +- Your curves will likely not start at 0, and will start wherever your minimum read-length setting was when removing adapters. +- You should typically see the bulk of the distribution falling between 40-120bp, which is normal for aDNA +- You may see large peaks at paired-end turn-arounds, due to very-long reads that could not overlap for merging being present, however this reads are normally from modern contamination. ### QualiMap @@ -388,7 +435,7 @@ Plateauing can be caused by a number of reasons: Qualimap is a tool which provides statistics on the quality of the mapping of your reads to your reference genome. It allows you to assess how well covered your reference genome is by your data, both in 'fold' depth (average number of times a given base on the reference is covered by a read) and 'percentage' (the percentage of all bases on the reference genome that is covered at a given fold depth). These outputs allow you to make decision if you have enough quality data for downstream applications like genotyping, and how to adjust the parameters for those tools accordingly. -> NB: Neither fold coverage nor percent coverage on there own is sufficient enough to asssess whether you have a high quality mapping. Abnormally high fold coverages of a smaller region such as highly conserved genes or unremoved-adapter-containing reference genomes can artifically inflate the mean coverage, yet a high percent coverage is not useful if all bases of the genome are covered at just 1x coverage. +> NB: Neither fold coverage nor percent coverage on there own is sufficient to assess whether you have a high quality mapping. Abnormally high fold coverages of a smaller region such as highly conserved genes or un-removed-adapter-containing reference genomes can artificially inflate the mean coverage, yet a high percent coverage is not useful if all bases of the genome are covered at just 1x coverage. Note that many of the statistics from this module are displayed in the General Stats table (see above), as they represent single values that are not plottable. @@ -404,8 +451,8 @@ The greater the number of bases covered at as high as possible fold coverage, th Things to watch out for: -* You will typically see a direct decay from the lowest coverage to higher. A large range of coverages along the X axis is potentially suspicious. -* If you have stacking of reads i.e. a small region with an abnormally large amount of reads despite the rest of the reference being quite shallowly covered, this will artificially increase your coverage. This would be represented by a small peak that is a much further along the X axis away from the main distribution of reads. +- You will typically see a direct decay from the lowest coverage to higher. A large range of coverages along the X axis is potentially suspicious. +- If you have stacking of reads i.e. a small region with an abnormally large amount of reads despite the rest of the reference being quite shallowly covered, this will artificially increase your coverage. This would be represented by a small peak that is a much further along the X axis away from the main distribution of reads. #### Cumulative Genome Coverage @@ -427,77 +474,36 @@ This plot shows the distirbution of th frequency of reads at different GC conten Things to watch out for: -* This plot should normally show a normal distribution around the average GC content of your reference genome. -* Binomial peaks may represent lab-based artefacts that should be further investigated. -* Skews of the peak to a higher GC content that the reference in Illumina dual-colour chemistry data (e.g. NextSeq or NovaSeq), may suggest long poly-G tails that are mapping to poly-G stretches of your genome. The EAGER2 trimming option `--complexity_filter_poly_g` can be used to remove these tails by utilising the tool FastP for detection and trimming. - -### DamageProfiler - -#### Background - -DamageProfiler is a tool which calculates a variety of standard 'aDNA' metrics from a BAM file. The primary plots here are the misincorporation and length distribution plots. Ancient DNA undergoes depurination and hydrolysis, causing fragmentation of molecules into gradually shorter fragments, and cytosine to thymine deamination damage, that occur on the subsequent single-stranded overhangs at the ends of molecules. - -Therefore, three main characteristics of ancient DNA are: - -* Short DNA fragments -* Elevated G and As (purines) just before strand breaks -* Increased C and Ts at ends of fragments - -#### Misincorporation Plots - -The MultiQC DamageProfiler module misincorporation plots shows the percent frequency (Y axis) of C to T mismatches at 5' read ends and complementary G to A mismatches at the 3' ends. The X axis represents base pairs from the end of the molecule from the given prime end, going into the middle of the molecule i.e. 1st base of molecule, 2nd base of molecule etc until the 14th base pair. The mismatches are when compared to the base of the reference genome at that position. - -When looking at the misincorporation plots, keep the following in mind: - -* As few-base single-stranded overhangs are more likely to occur than long overhangs, we expect to see a gradual decrease in the frequency of the modifications from position 1 to the inside of the reads. -* If your library has been **partially-UDG treated**, only the first one or two bases will display the the misincorporation frequency. -* If your library has been **UDG treated** you will expect to see extremely-low to no misincorporations at read ends. -* If your library is **single-stranded**, you will expect to see only C to T misincorporations at both 5' and 3' ends of the fragments. -* We generally expect that the older the sample, or the less-ideal preservational environtment (hot/wet) the greater the frequency of C to T/G to A. -* The curve will be not smooth then you have few reads informing the frequency calculation. Read counts of less than 500 are likely not reliable. - -

- -

- -> **NB:** An important difference to note compared to the MapDamage tool, which DamageProfiler is an exact-reimplementation of, is that the percent frequency on the Y axis is not fixed between 0 and 0.3, and will 'zoom' into small values the less damage there is - -#### Length Distribution - -The MultiQC DamageProfiler module length distribution plots show the frequency of read lengths across forward and reverse reads respectively. - -When looking at the length distribution plots, keep in mind the following: - -* Your curves will likely not start at 0, and will start wherever your minimum read-length setting was when removing adapters. -* You should typically see the bulk of the distribution falling between 40-120bp, which is normal for aDNA -* You may see large peaks at paired-end turn-arounds, due to very-long reads that could not overlap for merging being present, however this reads are normally from modern contamination. +- This plot should normally show a normal distribution around the average GC content of your reference genome. +- Bimodal peaks may represent lab-based artefacts that should be further investigated. +- Skews of the peak to a higher GC content that the reference in Illumina dual-colour chemistry data (e.g. NextSeq or NovaSeq), may suggest long poly-G tails that are mapping to poly-G stretches of your genome. The EAGER2 trimming option `--complexity_filter_poly_g` can be used to remove these tails by utilising the tool FastP for detection and trimming. ## Output Files -This section gives a brief summary of where to look for what files for downstream analysis. +This section gives a brief summary of where to look for what files for downstream analysis. This covers *all* modules. Each module has it's own output directory which sit alongside the `MultiQC/` directory from which you opened the report. -* `reference_genome/` - this directory contains the indexing files of your input reference genome (i.e. the various `bwa` indices, a `samtools`' `.fai` file, and a picard `.dict`), if you used the `--saveReference` flag. -* `FastQC/` - this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval. -* `AdapterRemoval/` - this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) FASTQ files. These you can use for downstream applications such as taxonomic binning for metagenomic studies. -* `mapping/` - this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). You will also find a corresponding BAM index file (ending in `.csi` or `.bam`). You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in EAGER2 (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!). -* `samtools/` - this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file. -* `deduplication/` - this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics. -* `endorSpy/` - this contains all JSON files exported from the endorSpy endogenous DNA calculation tool. The JSON files are generated specifically for display in the MultiQC general statistics table and is otherwise very likely not useful for you. -* `preseq/` - this contains a `.ccurve` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth. -* `qualimap/` - this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads. -* `damageprofiler/` - this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. -* `pmdtools/` this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`. The BAM files do not have corresponding index files. -* `trimmed_bam/` this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_left` and `--bamutils_clip_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read). -* `genotyping/` this contains all the (gzipped) VCF files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have VCF files corresponding to your deduplicated BAM files, as well as any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. -* `MultiVCFAnalyzer/` this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files. -* `sex_determination/` this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the Sample Name, the Nr of Autosomal SNPs, Nr of SNPs on the X/Y chromosome, the Nr of reads mapping to the Autosomes, the Nr of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per bam. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. -* `nuclear_contamination/` this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt` which is a summary table of the results for all individual. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimate (with their respective standard errors) for both Method1 and Method2. -* `bedtools/` this contains two files as the output from bedtools coverage. One file contains the 'breadth' coverage (`*.breadth.gz`). This file will have the contents of your annotation file (e.g. BED/GFF), and the following subsequent columns: no. reads on feature, # bases at depth, length of feature, and % of feature. The second file (`*.depth.gz`), contains the contents of your annotation file (e.g. BED/GFF), and an additional column which is mean depth coverage (i.e. average number of reads covering each position). -* `metagenomic_classification/` This contains the output for a given metagenomic classifier. - * Malt will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. - * Kraken will contain the Kraken output and report files, as well as a merged Taxon count table. -* `MaltExtract/` this will contain a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA) -* `consensus_sequence/` this contains three FASTA files from VCF2Genome, of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively. -* `librarymerged_bams/` these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the BAM will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on) +- `reference_genome/` - this directory contains the indexing files of your input reference genome (i.e. the various `bwa` indices, a `samtools`' `.fai` file, and a picard `.dict`), if you used the `--saveReference` flag. +- `FastQC/` - this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval. +- `AdapterRemoval/` - this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) FASTQ files. These you can use for downstream applications such as taxonomic binning for metagenomic studies. +- `mapping/` - this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). You will also find a corresponding BAM index file (ending in `.csi` or `.bam`), and if running the `bowtie2` mapper - a log ending in `_bt2.log`. You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in nf-core/eager (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!). +- `samtools/` - this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file. +- `deduplication/` - this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics. +- `endorSpy/` - this contains all JSON files exported from the endorSpy endogenous DNA calculation tool. The JSON files are generated specifically for display in the MultiQC general statistics table and is otherwise very likely not useful for you. +- `preseq/` - this contains a `.ccurve` file for every BAM file that had enough deduplication statistics to generate a complexity curve for estimating the amount unique reads that will be yield if the library is re-sequenced. You can use this file for plotting e.g. in `R` to find your sequencing target depth. +- `qualimap/` - this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads. +- `damageprofiler/` - this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. +- `pmdtools/` this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`. The BAM files do not have corresponding index files. +- `trimmed_bam/` this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_left` and `--bamutils_clip_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read). +- `genotyping/` this contains all the (gzipped) VCF files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have VCF files corresponding to your deduplicated BAM files, as well as any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. +- `MultiVCFAnalyzer/` this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files. +- `sex_determination/` this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the Sample Name, the Nr of Autosomal SNPs, Nr of SNPs on the X/Y chromosome, the Nr of reads mapping to the Autosomes, the Nr of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per bam. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. +- `nuclear_contamination/` this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt` which is a summary table of the results for all individual. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimate (with their respective standard errors) for both Method1 and Method2. +- `bedtools/` this contains two files as the output from bedtools coverage. One file contains the 'breadth' coverage (`*.breadth.gz`). This file will have the contents of your annotation file (e.g. BED/GFF), and the following subsequent columns: no. reads on feature, # bases at depth, length of feature, and % of feature. The second file (`*.depth.gz`), contains the contents of your annotation file (e.g. BED/GFF), and an additional column which is mean depth coverage (i.e. average number of reads covering each position). +- `metagenomic_classification/` This contains the output for a given metagenomic classifier. + - Malt will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. + - Kraken will contain the Kraken output and report files, as well as a merged Taxon count table. +- `MaltExtract/` this will contain a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA) +- `consensus_sequence/` this contains three FASTA files from VCF2Genome, of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively. +- `librarymerged_bams/` these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the BAM will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on) diff --git a/docs/usage.md b/docs/usage.md index 8430a3883..59ea4f2cf 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,176 +10,30 @@ - [Running the pipeline](#running-the-pipeline) - [Updating the pipeline](#updating-the-pipeline) - [Mandatory Arguments](#mandatory-arguments) - - [`-profile`](#-profile) - - [`--input`](#--input) - - [Direct Input Method](#direct-input-method) - - [TSV Input Method](#tsv-input-method) - - [`--bam`](#--bam) - - [`--single_stranded`](#--single_stranded) - - [`--colour_chemistry`](#--colour_chemistry) - - [`--fasta`](#--fasta) - - [`--genome` (using iGenomes)](#--genome-using-igenomes) - [Output Directories](#output-directories) - - [`--outdir`](#--outdir) - - [`-w / -work-dir`](#-w---work-dir) - [Optional Reference Options](#optional-reference-options) - - [`--large_ref`](#--large_ref) - - [`--save_reference`](#--save_reference) - - [`--bwa_index`](#--bwa_index) - - [`--seq_dict`](#--seq_dict) - - [`--fasta_index`](#--fasta_index) - [Other run specific parameters](#other-run-specific-parameters) - - [`-r`](#-r) - - [`--max_memory`](#--max_memory) - - [`--max_time`](#--max_time) - - [`--max_cpus`](#--max_cpus) - - [`--email`](#--email) - - [`--plaintext_email`](#--plaintext_email) - - [`-name`](#-name) - - [`-resume`](#-resume) - - [`-c`](#-c) - - [`--monochrome_logs`](#--monochrome_logs) - - [`--multiqc_config`](#--multiqc_config) - - [`--custom_config_version`](#--custom_config_version) - [Adjustable parameters for nf-core/eager](#adjustable-parameters-for-nf-coreeager) - [Step skipping parameters](#step-skipping-parameters) - - [`--skip_fastqc`](#--skip_fastqc) - - [`--skip_adapterremoval`](#--skip_adapterremoval) - - [`--skip_preseq`](#--skip_preseq) - - [`--skip_deduplication`](#--skip_deduplication) - - [`--skip_damage_calculation`](#--skip_damage_calculation) - - [`--skip_qualimap`](#--skip_qualimap) - [BAM Conversion Options](#bam-conversion-options) - - [`--run_convertinputbam`](#--run_convertinputbam) - [Complexity Filtering Options](#complexity-filtering-options) - - [`--complexity_filter_poly_g`](#--complexity_filter_poly_g) - - [`--complexity_filter_poly_g_min`](#--complexity_filter_poly_g_min) - [Adapter Clipping and Merging Options](#adapter-clipping-and-merging-options) - - [`--clip_forward_adaptor`](#--clip_forward_adaptor) - - [`--clip_reverse_adaptor`](#--clip_reverse_adaptor) - - [`--clip_readlength](#--clip_readlength) - - [`--clip_min_read_quality`](#--clip_min_read_quality) - - [`--clip_min_adap_overlap`](#--clip_min_adap_overlap) - - [`--skip_collapse`](#--skip_collapse) - - [`--skip_trim`](#--skip_trim) - - [`--preserve5p`](#--preserve5p) - - [`--mergedonly`](#--mergedonly) - [Read Mapping Parameters](#read-mapping-parameters) - - [`--mapper`](#--mapper) - - [BWA (default)](#bwa-default) - - [`--bwaalnn`](#--bwaalnn) - - [`--bwaalnk`](#--bwaalnk) - - [`--bwaalnl`](#--bwaalnl) - - [CircularMapper](#circularmapper) - - [`--circularextension`](#--circularextension) - - [`--circulartarget`](#--circulartarget) - - [`--circularfilter`](#--circularfilter) - [Mapped Reads Stripping](#mapped-reads-stripping) - - [`--strip_input_fastq`](#--strip_input_fastq) - - [`--strip_mode`](#--strip_mode) - [Read Filtering and Conversion Parameters](#read-filtering-and-conversion-parameters) - - [`--run_bam_filtering`](#--run_bam_filtering) - - [`--bam_discard_unmapped`](#--bam_discard_unmapped) - - [`--bam_unmapped_type`](#--bam_unmapped_type) - - [`--bam_mapping_quality_threshold`](#--bam_mapping_quality_threshold) - [Read DeDuplication Parameters](#read-deduplication-parameters) - - [`--dedupper`](#--dedupper) - - [`--dedup_all_merged`](#--dedup_all_merged) - [Library Complexity Estimation Parameters](#library-complexity-estimation-parameters) - - [`--preseq_step_size`](#--preseq_step_size) - [DNA Damage Assessment Parameters](#dna-damage-assessment-parameters) - - [`--udg_type`](#--udg_type) - - [`--damageprofiler_length`](#--damageprofiler_length) - - [`--damageprofiler_threshold`](#--damageprofiler_threshold) - - [`--damageprofiler_yaxis`](#--damageprofiler_yaxis) - - [`--run_pmdtools`](#--run_pmdtools) - - [`--pmdtools_range`](#--pmdtools_range) - - [`--pmdtools_threshold`](#--pmdtools_threshold) - - [`--pmdtools_reference_mask`](#--pmdtools_reference_mask) - - [`--pmdtools_max_reads`](#--pmdtools_max_reads) - [BAM Trimming Parameters](#bam-trimming-parameters) - - [`--run_trim_bam`](#--run_trim_bam) - - [`--bamutils_clip_left` / `--bamutils_clip_right`](#--bamutils_clip_left----bamutils_clip_right) - - [`--bamutils_softclip`](#--bamutils_softclip) - [Captured Library Parameters](#captured-library-parameters) - - [`--snpcapture` false](#--snpcapture-false) - - [`--bedfile`](#--bedfile) - [Feature Annotation Statistics](#feature-annotation-statistics) - - [`--run_bedtools_coverage`](#--run_bedtools_coverage) - - [`--anno_file`](#--anno_file) - [Genotyping Parameters](#genotyping-parameters) - - [`--run_genotyping`](#--run_genotyping) - - [`--genotyping_tool`](#--genotyping_tool) - - [`--genotyping_source`](#--genotyping_source) - - [`--gatk_ug_jar`](#--gatk_ug_jar) - - [`--gatk_call_conf`](#--gatk_call_conf) - - [`--gatk_ploidy`](#--gatk_ploidy) - - [`--gatk_dbsnp`](#--gatk_dbsnp) - - [`--gatk_ug_out_mode`](#--gatk_ug_out_mode) - - [`--gatk_hc_out_mode`](#--gatk_hc_out_mode) - - [`--gatk_ug_genotype_model`](#--gatk_ug_genotype_model) - - [`--gatk_hc_emitrefconf`](#--gatk_hc_emitrefconf) - - [`--gatk_ug_keep_realign_bam`](#--gatk_ug_keep_realign_bam) - - [`--gatk_downsample`](#--gatk_downsample) - - [`--gatk_ug_gatk_ug_defaultbasequalities`](#--gatk_ug_gatk_ug_defaultbasequalities) - - [`--freebayes_C`](#--freebayes_c) - - [`--freebayes_g`](#--freebayes_g) - - [`--freebayes_p`](#--freebayes_p) - - [`--pileupcaller_bedfile`](#pileupcallerbedfile) - - [`--pileupcaller_snpfile`](#pileupcallersnpfile) - - [`--pileupcaller_method`](#pileupcallermethod) - [Consensus Sequence Generation](#consensus-sequence-generation) - - [`--run_vcf2genome`](#--run_vcf2genome) - - [`--vcf2genome_outfile`](#--vcf2genome_outfile) - - [`--vcf2genome_header`](#--vcf2genome_header) - - [`--vcf2genome_minc`](#--vcf2genome_minc) - - [`--vcf2genome_minq`](#--vcf2genome_minq) - - [`--vcf2genome_minfreq`](#--vcf2genome_minfreq) - [Mitochondrial to Nuclear Ratio](#mitochondrial-to-nuclear-ratio) - - [`--run_mtnucratio`](#--run_mtnucratio) - - [`--mtnucratio_header`](#--mtnucratio_header) - [SNP Table Generation](#snp-table-generation) - - [`--run_multivcfanalyzer`](#--run_multivcfanalyzer) - - [`--write_allele_frequencies`](#--write_allele_frequencies) - - [`--min_genotype_quality`](#--min_genotype_quality) - - [`--min_base_coverage`](#--min_base_coverage) - - [`--min_allele_freq_hom`](#--min_allele_freq_hom) - - [`--min_allele_freq_het`](#--min_allele_freq_het) - - [`--additional_vcf_files`](#--additional_vcf_files) - - [`--reference_gff_annotations`](#--reference_gff_annotations) - - [`--reference_gff_exclude`](#--reference_gff_exclude) - - [`--snp_eff_results`](#--snp_eff_results) - [Human Sex Determination](#human-sex-determination) - - [`--run_sexdeterrmine`](#--run_sexdeterrmine) - - [`--sexdeterrmine_bedfile`](#--sexdeterrmine_bedfile) - [Human Nuclear Contamination](#human-nuclear-contamination) - - [`--run_nuclear_contamination`](#--run_nuclear_contamination) - - [`--contamination_chrom_name`](#--contamination_chrom_name) - [Metagenomic Screening](#metagenomic-screening) - - [`--run_metagenomic_screening`](#--run_metagenomic_screening) - - [`--metagenomic_tool`](#--metagenomic_tool) - - [`--metagenomic_min_support_reads`](#--metagenomic_min_support_reads) - - [`--database`](#--database) - - [`--percent_identity`](#--percent_identity) - - [`--malt_mode`](#--malt_mode) - - [`--malt_alignment_mode`](#--malt_alignment_mode) - - [`--malt_top_percent`](#--malt_top_percent) - - [`--malt_min_support_mode`](#--malt_min_support_mode) - - [`--malt_min_support_percent`](#--malt_min_support_percent) - - [`--malt_max_queries`](#--malt_max_queries) - - [`--malt_memory_mode`](#--malt_memory_mode) - - [`--run_maltextract`](#--run_maltextract) - - [`--maltextract_taxon_list`](#--maltextract_taxon_list) - - [`--maltextract_ncbifiles`](#--maltextract_ncbifiles) - - [`--maltextract_filter`](#--maltextract_filter) - - [`--maltextract_toppercent`](#--maltextract_toppercent) - - [`--maltextract_destackingoff`](#--maltextract_destackingoff) - - [`--maltextract_downsamplingoff`](#--maltextract_downsamplingoff) - - [`--maltextract_duplicateremovaloff`](#--maltextract_duplicateremovaloff) - - [`--maltextract_matches`](#--maltextract_matches) - - [`--maltextract_megansummary`](#--maltextract_megansummary) - - [`--maltextract_percentidentity`](#--maltextract_percentidentity) - - [`--maltextract_topalignment`](#--maltextract_topalignment) - [Clean up](#clean-up) ## General Nextflow info @@ -729,13 +583,14 @@ This flag means that only merged reads are sent downstream for analysis. Singlet #### `--mapper` -Specify which mapping tool to use. Options are BWA aln (`'bwaaln'`), BWA mem (`'bwamem'`), circularmapper (`'circularmapper'`). bwa aln is the default and best for short read ancient DNA. bwa mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing a extend-remap procedure (see Peltzer et al 2016, Genome Biology for details). Default is 'bwaaln' +Specify which mapping tool to use. Options are BWA aln (`'bwaaln'`), BWA mem (`'bwamem'`), circularmapper (`'circularmapper'`), or bowtie2 (`bowtie2`). bwa aln is the default and highly suited for short read ancient DNA. bwa mem can be quite useful for modern DNA, but is rarely used in projects for ancient DNA. CircularMapper enhances the mapping procedure to circular references, using the BWA algorithm but utilizing a extend-remap procedure (see Peltzer et al 2016, Genome Biology for details). Bowtie2 is similar to bwa aln, and has recently been suggested to provide slightly better results under certain conditions [(Poullet and Orlando 2020)](https://doi.org/10.3389/fevo.2020.00105), as well as providing extra functionality (such as FASTQ trimming). Default is 'bwaaln' More documentation can be seen for each tool under: - [bwa aln](http://bio-bwa.sourceforge.net/bwa.shtml#3) - [bwa mem](http://bio-bwa.sourceforge.net/bwa.shtml#3) - [CircularMapper](https://circularmapper.readthedocs.io/en/latest/contents/userguide.html) +- [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line) #### BWA (default) @@ -769,6 +624,32 @@ The chromosome in your FastA reference that you'd like to be treated as circular If you want to filter out reads that don't map to a circular chromosome, turn this on. By default this option is turned off. +#### Bowtie2 + +##### `--bt2_alignmode` + +The type of read alignment to use. Options are 'local' or 'end-to-end'. Local allows only partial alignment of read, with ends of reads possibly 'soft-clipped' (i.e. remain unaligned/ignored), if the soft-clipped alignment provides best alignment score. End-to-end requires all nucleotides to be aligned. Default is 'local', following [Cahill et al (2018)](https://doi.org/10.1093/molbev/msy018) and [(Poullet and Orlando 2020)](https://doi.org/10.3389/fevo.2020.00105). + +##### `--bt2_sensitivity` + +The Bowtie2 'preset' to use. Options: 'no-preset' 'very-fast', 'fast', 'sensitive', or 'very-sensitive'. These strings apply to both `--bt2_alignmode` options. See the Bowtie2 [manual](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line) for actual settings. Default is 'sensitive' (following [Poullet and Orlando (2020)](https://doi.org/10.3389/fevo.2020.00105), when running damaged-data _without_ UDG treatment) + +##### `--bt2n` + +The number of mismatches allowed in the seed during seed-and-extend procedure of Bowtie2. This will override any values set with `--bt2_sensitivity`. Can either be 0 or 1. Default: 0 (i.e. use`--bt2_sensitivity` defaults). + +##### `--bt2l` + +The length of the seed substring to use during seeding. This will override any values set with `--bt2_sensitivity`. Default: 0 (i.e. use`--bt2_sensitivity` defaults: [20 for local and 22 for end-to-end](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line). + +##### `-bt2_trim5` + +Number of bases to trim of 5' (left) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0 + +##### `-bt2_trim3` + +Number of bases to trim of 3' (right) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0. + ### Mapped Reads Stripping These parameters are used for removing mapped reads from the original input FASTQ files, usually in the context of uploading the original FASTQ files to a public read archive (NCBI SRA/EBI ENA). diff --git a/main.nf b/main.nf index b3996ca90..44edf8e04 100644 --- a/main.nf +++ b/main.nf @@ -49,7 +49,8 @@ def helpMessage() { --run_convertinputbam Turns on convertion of an input BAM file into FASTQ format before pre-processing (e.g. AdapterRemoval etc.). References Optional additional pre-made indices, or you wish to overwrite any of the references. - --bwa_index Path and name of a bwa indexed FASTA reference file with index suffixes (i.e. everything before the endings '.amb' '.ann' '.bwt'. Most likely the same value as --fasta) + --bwa_index Path and name of a bwa indexed FASTA reference file without index suffixes (i.e. everything before the endings '.amb' '.ann' '.bwt'. Most likely the same value as --fasta) + --bt2_index Path and name of a bowtie2 indexed FASTA reference file without index index suffixes (i.e. everything before the endings e.g. '.1.bt2', '.2.bt2', '.rev.1.bt2'. Most likely the same value as --fasta) --bedfile Path to BED file for SNPCapture methods. --seq_dict Path to picard sequence dictionary file (typically ending in '.dict'). --fasta_index Path to samtools FASTA index (typically ending in '.fai'). @@ -79,14 +80,20 @@ def helpMessage() { --mergedonly Turn on sending downstream only merged reads (un-merged reads and singletons are discarded). Mapping - --mapper Specify which mapper to use. Options: 'bwaaln', 'bwamem', 'circularmapper'. Default: '${params.mapper}' + --mapper Specify which mapper to use. Options: 'bwaaln', 'bwamem', 'circularmapper', 'bowtie2'. Default: '${params.mapper}' --bwaalnn Specify the -n parameter for BWA aln. Default: ${params.bwaalnn} --bwaalnk Specify the -k parameter for BWA aln. Default: ${params.bwaalnk} --bwaalnl Specify the -l parameter for BWA aln. Default: ${params.bwaalnl} --circularextension Specify the number of bases to extend reference by (circularmapper only). Default: ${params.circularextension} --circulartarget Specify the target chromosome for CM (circularmapper only). Default: '${params.circulartarget}' --circularfilter Turn on to filter off-target reads (circularmapper only). - + --bt2_alignmode Specify the bowtie2 alignment mode. Options: 'local', 'end-to-end'. Default: '${params.bt2_alignmode}' + --bt2_sensitivity Specify the level of sensitivity for the bowtie2 alignment mode. Options: 'no-preset', 'very-fast', 'fast', 'sensitive', 'very-sensitive'. Default: '${params.bt2_sensitivity}' + --bt2n Specify the -N parameter for bowtie2 (mismatches in seed). This will override defaults from alignmode/sensitivity. Default: ${params.bt2n} + --bt2l Specify the -L parameter for bowtie2 (length of seed substrings). Default: ${params.bt2l} + --bt2_trim5 Specify number of bases to trim off from 5' (left) end of read before alignment. Default: ${params.bt2_trim5} + --bt2_trim3 Specify number of bases to trim off from 3' (right) end of read before alignment. Default: ${params.bt2_trim3} + Stripping --strip_input_fastq Turn on creating pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data) --strip_mode Stripping mode. Remove mapped reads completely from FASTQ (strip) or just mask mapped reads sequence by N (replace). Default: '${params.strip_mode}' @@ -214,7 +221,7 @@ def helpMessage() { --max_cpus Maximum number of CPUs to use for each step of the pipeline. Should be in form e.g. Default: '${params.max_cpus}' --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits --plaintext_email Receive plain text emails rather than HTML - --max_multiqc_email_size Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) + --max_multiqc_email_size Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) For a full list and more information of available parameters, consider the documentation (https://github.com/nf-core/eager/). """.stripIndent() @@ -256,7 +263,7 @@ if ( params.fasta.isEmpty () ){ file zipped_fasta output: - file "*.{fa,fn,fna,fasta}" into ch_fasta_for_bwaindex,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer + file "*.{fa,fn,fna,fasta}" into ch_fasta_for_bwaindex,ch_fasta_for_bt2index,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_genotyping_pileupcaller,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer script: rm_zip = zipped_fasta - '.gz' @@ -268,10 +275,11 @@ if ( params.fasta.isEmpty () ){ } else { fasta_for_indexing = Channel .fromPath("${params.fasta}", checkIfExists: true) - .into{ ch_fasta_for_bwaindex; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer } + .into{ ch_fasta_for_bwaindex; ch_fasta_for_bt2index; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_genotyping_pileupcaller; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer } lastPath = params.fasta.lastIndexOf(File.separator) bwa_base = params.fasta.substring(lastPath+1) + bt2_base = params.fasta.substring(lastPath+1) } // Check that fasta index file path ends in '.fai' @@ -285,8 +293,21 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome } // Mapper sanity checking -if (params.mapper != 'bwaaln' && !params.mapper == 'circularmapper' && !params.mapper == 'bwamem'){ - exit 1, "[nf-core/eager] error: invalid mapper option. Options are: 'bwaaln', 'bwamem', 'circularmapper'. Default: 'bwaaln'. You gave: ${params.mapper}!" +if (params.mapper != 'bwaaln' && !params.mapper == 'circularmapper' && !params.mapper == 'bwamem' && !params.mapper == "bowtie2"){ + exit 1, "[nf-core/eager] error: invalid mapper option. Options are: 'bwaaln', 'bwamem', 'circularmapper', 'bowtie2'. Default: 'bwaaln'. You gave: ${params.mapper}!" +} + +if (params.mapper == 'bowtie2' && params.bt2_alignmode != 'local' && params.bt2_alignmode != 'end-to-end' ) { + exit 1, "[nf-core/eager] error: invalid bowtie2 alignment mode. Options: 'local', 'end-to-end'. You gave: ${params.bt2_alignmode}" +} + +if (params.mapper == 'bowtie2' && params.bt2_sensitivity != 'no-preset' && params.bt2_sensitivity != 'very-fast' && params.bt2_sensitivity != 'fast' && params.bt2_sensitivity != 'sensitive' && params.bt2_sensitivity != 'very-sensitive' ) { + exit 1, "[nf-core/eager] error: invalid bowtie2 sensitivity mode. Options: 'no-preset', 'very-fast', 'fast', 'sensitive', 'very-sensitive'. Options are for both alignmodes You gave: ${params.bt2_sensitivity}" +} + +if (params.bt2n != 0 && params.bt2n != 1) { + exit 1, "[nf-core/eager] error: invalid bowtie2 --bt2n (-N) parameter. Options: 0, 1. You gave: ${params.bt2n}" + } // Index files provided? Then check whether they are correct and complete @@ -297,8 +318,23 @@ if( params.bwa_index != '' && (params.mapper == 'bwaaln' | params.mapper == 'bwa Channel .fromPath(bwa_dir, checkIfExists: true) - .ifEmpty { exit 1, "[nf-core/eager] error: bwa index directory not found: ${bwa_dir}" } + .ifEmpty { exit 1, "[nf-core/eager] error: bwa indicies not found in: ${bwa_dir}" } .into {bwa_index; bwa_index_bwamem} + + bt2_index = '' +} + +if( params.bt2_index != '' && params.mapper == 'bowtie2' ){ + lastPath = params.bt2_index.lastIndexOf(File.separator) + bt2_dir = params.bt2_index.substring(0,lastPath+1) + bt2_base = params.bt2_index.substring(lastPath+1) + + Channel + .fromPath(bt2_dir, checkIfExists: true) + .ifEmpty { exit 1, "[nf-core/eager] error: bowtie2 indicies not found in: ${bt2_dir}" } + .into {bt2_index; bt2_index_bwamem} + + bwa_index = '' } // Validate BAM input isn't set to paired_end @@ -606,7 +642,7 @@ summary['Input'] = params.input summary['Convert input BAM?'] = params.run_convertinputbam ? 'Yes' : 'No' summary['Fasta Ref'] = params.fasta summary['BAM Index Type'] = (params.large_ref == "") ? 'BAI' : 'CSI' -if(params.bwa_index) summary['BWA Index'] = params.bwa_index +if(params.bwa_index || params.bt2_index ) summary['BWA Index'] = "Yes" summary['Skipping FASTQC?'] = params.skip_fastqc ? 'Yes' : 'No' summary['Skipping AdapterRemoval?'] = params.skip_adapterremoval ? 'Yes' : 'No' if (!params.skip_adapterremoval) { @@ -697,8 +733,8 @@ Channel.from(summary.collect{ [it.key, it.value] }) /////////////////////////////////////////////////// // BWA Index -if(params.bwa_index == '' && !params.fasta.isEmpty() && (params.mapper == 'bwaaln' || params.mapper == 'bwamem' || params.mapper == 'circularmapper')){ -process makeBWAIndex { +if( params.bwa_index == '' && !params.fasta.isEmpty() && (params.mapper == 'bwaaln' || params.mapper == 'bwamem' || params.mapper == 'circularmapper')){ + process makeBWAIndex { label 'sc_medium' tag "${fasta}" publishDir path: "${params.outdir}/reference_genome/bwa_index", mode: 'copy', saveAs: { filename -> @@ -721,6 +757,38 @@ process makeBWAIndex { mkdir BWAIndex && mv ${fasta}* BWAIndex """ } + bt2_index = 'none' +} + +// bowtie2 Index +if(params.bt2_index == '' && !params.fasta.isEmpty() && params.mapper == "bowtie2"){ + process makeBT2Index { + label 'sc_medium' + tag "${fasta}" + publishDir path: "${params.outdir}/reference_genome/bt2_index", mode: 'copy', saveAs: { filename -> + if (params.save_reference) filename + else if(!params.save_reference && filename == "where_are_my_files.txt") filename + else null + } + + input: + file fasta from ch_fasta_for_bt2index + file where_are_my_files + + output: + file "BT2Index" into (bt2_index) + file "where_are_my_files.txt" + + script: + """ + bowtie2-build $fasta $fasta + mkdir BT2Index && mv ${fasta}* BT2Index + """ + } + + bwa_index = 'none' + bwa_index_bwamem = 'none' + } // FASTA Index (FAI) @@ -1151,7 +1219,7 @@ ch_lanemerge_for_mapping } .mix(ch_branched_for_lanemerge.skip_merge) - .into { ch_lanemerge_for_skipmap; ch_lanemerge_for_bwa; ch_lanemerge_for_cm; ch_lanemerge_for_bwamem } + .into { ch_lanemerge_for_skipmap; ch_lanemerge_for_bwa; ch_lanemerge_for_cm; ch_lanemerge_for_bwamem; ch_lanemerge_for_bt2 } // ENA upload doesn't do separate lanes, so merge raw FASTQs for mapped-reads stripping @@ -1224,8 +1292,6 @@ process bwa { tag "${libraryid}" publishDir "${params.outdir}/mapping/bwa", mode: 'copy' - when: params.mapper == 'bwaaln' - input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_lanemerge_for_bwa file index from bwa_index.collect() @@ -1233,6 +1299,9 @@ process bwa { output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.mapped.bam"), file("*.{bai,csi}") into ch_output_from_bwa + when: + params.mapper == 'bwaaln' + script: size = "${params.large_ref}" ? '-c' : '' fasta = "${index}/${bwa_base}" @@ -1263,15 +1332,16 @@ process bwamem { tag "$libraryid" publishDir "${params.outdir}/mapping/bwamem", mode: 'copy' - when: params.mapper == 'bwamem' - input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_lanemerge_for_bwamem file index from bwa_index_bwamem.collect() output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.mapped.bam"), file("*.{bai,csi}") into ch_output_from_bwamem - + + when: + params.mapper == 'bwamem' + script: fasta = "${index}/${bwa_base}" size = "${params.large_ref}" ? '-c' : '' @@ -1301,14 +1371,15 @@ process circulargenerator{ else null } - when: params.mapper == 'circularmapper' - input: file fasta from ch_fasta_for_circulargenerator output: file "${prefix}.{amb,ann,bwt,sa,pac}" into ch_circularmapper_indices + when: + params.mapper == 'circularmapper' + script: prefix = "${fasta.baseName}_${params.circularextension}.fasta" """ @@ -1323,8 +1394,6 @@ process circularmapper{ tag "$libraryid" publishDir "${params.outdir}/mapping/circularmapper", mode: 'copy' - when: params.mapper == 'circularmapper' - input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_lanemerge_for_cm file index from ch_circularmapper_indices.collect() @@ -1332,7 +1401,10 @@ process circularmapper{ output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.mapped.bam"), file("*.{bai,csi}") into ch_output_from_cm, ch_outputindex_from_cm - + + when: + params.mapper == 'circularmapper' + script: filter = "${params.circularfilter}" ? '' : '-f true -x false' elongated_root = "${fasta.baseName}_${params.circularextension}.fasta" @@ -1360,8 +1432,81 @@ process circularmapper{ } +process bowtie2 { + label 'mc_medium' + tag "${libraryid}" + publishDir "${params.outdir}/mapping/bt2", mode: 'copy' + + input: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_lanemerge_for_bt2 + file index from bt2_index.collect() + + output: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.mapped.bam"), file("*.{bai,csi}") into ch_output_from_bt2 + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*_bt2.log") into ch_bt2_for_multiqc + + when: + params.mapper == 'bowtie2' + + script: + size = "${params.large_ref}" ? '-c' : '' + fasta = "${index}/${bt2_base}" + trim5 = "${params.bt2_trim5}" != 0 ? "--trim5 ${params.bt2_trim5}" : "" + trim3 = "${params.bt2_trim3}" != 0 ? "--trim3 ${params.bt2_trim3}" : "" + bt2n = "${params.bt2n}" != 0 ? "-N ${params.bt2n}" : "" + bt2l = "${params.bt2l}" != 0 ? "-L ${params.bt2l}" : "" + + if ( "${params.bt2_alignmode}" == "end-to-end" ) { + switch ( "${params.bt2_sensitivity}" ) { + case "no-preset": + sensitivity = ""; break + case "very-fast": + sensitivity = "--very-fast"; break + case "fast": + sensitivity = "--fast"; break + case "sensitive": + sensitivity = "--sensitive"; break + case "very-sensitive": + sensitivity = "--very-sensitive"; break + default: + sensitivity = ""; break + } + } else if ("${params.bt2_alignmode}" == "local") { + switch ( "${params.bt2_sensitivity}" ) { + case "no-preset": + sensitivity = ""; break + case "very-fast": + sensitivity = "--very-fast-local"; break + case "fast": + sensitivity = "--fast-local"; break + case "sensitive": + sensitivity = "--sensitive-local"; break + case "very-sensitive": + sensitivity = "--very-sensitive-local"; break + default: + sensitivity = ""; break + + } + } + + //PE data without merging, PE data without any AR applied + if ( seqtype == 'PE' && ( params.skip_collapse || params.skip_adapterremoval ) ){ + """ + bowtie2 -x ${fasta} -1 ${r1} -2 ${r2} -p ${task.cpus} ${sensitivity} ${bt2n} ${bt2l} ${trim5} ${trim3} 2> "${libraryid}"_bt2.log | samtools sort -@ ${task.cpus} -O bam > "${libraryid}".mapped.bam + samtools index "${size}" "${libraryid}".mapped.bam + """ + } else { + //PE collapsed, or SE data + """ + bowtie2 -x ${fasta} -U ${r1} -p ${task.cpus} ${sensitivity} ${bt2n} ${bt2l} ${trim5} ${trim3} 2> "${libraryid}"_bt2.log | samtools sort -@ ${task.cpus} -O bam > "${libraryid}".mapped.bam + samtools index "${size}" "${libraryid}".mapped.bam + """ + } + +} + // Gather all mapped BAMs from all possible mappers into common channels to send downstream -ch_output_from_bwa.mix(ch_output_from_bwamem, ch_output_from_cm, ch_indexbam_for_filtering) +ch_output_from_bwa.mix(ch_output_from_bwamem, ch_output_from_cm, ch_indexbam_for_filtering, ch_output_from_bt2) .into { ch_mapping_for_skipfiltering; ch_mapping_for_filtering; ch_mapping_for_samtools_flagstat } // Post-mapping QC @@ -2632,7 +2777,8 @@ process get_software_versions { circulargenerator --help | head -n 1 &> v_circulargenerator.txt 2>&1 || true samtools --version &> v_samtools.txt 2>&1 || true dedup -v &> v_dedup.txt 2>&1 || true - picard MarkDuplicates --version &> v_markduplicates.txt 2>&1 || true + ## bioconda recipe of picard is incorrectly set up and extra warning made with stderr, this ugly command ensures only version exported + ( exec 7>&1; picard MarkDuplicates --version 2>&1 >&7 | grep -v '/' >&2 ) 2> v_markduplicates.txt || true qualimap --version &> v_qualimap.txt 2>&1 || true preseq &> v_preseq.txt 2>&1 || true gatk --version 2>&1 | head -n 1 > v_gatk.txt 2>&1 || true @@ -2652,6 +2798,7 @@ process get_software_versions { kraken2 --version | head -n 1 &> v_kraken.txt || true endorS.py --version &> v_endorSpy.txt || true pileupCaller --version &> v_sequencetools.txt 2>&1 || true + bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true scrape_software_versions.py &> software_versions_mqc.yaml """ @@ -2671,6 +2818,7 @@ process multiqc { file('fastqc/*') from ch_fastqc_after_clipping.collect().ifEmpty([]) file software_versions_mqc from software_versions_yaml.collect().ifEmpty([]) file ('adapter_removal/*') from ch_adapterremoval_logs.collect().ifEmpty([]) + file ('mapping/bt2/*') from ch_bt2_for_multiqc.collect().ifEmpty([]) file ('flagstat/*') from ch_flagstat_for_multiqc.collect().ifEmpty([]) file ('flagstat_filtered/*') from ch_bam_filtered_flagstat_for_multiqc.collect().ifEmpty([]) file ('preseq/*') from ch_preseq_for_multiqc.collect().ifEmpty([]) diff --git a/nextflow.config b/nextflow.config index d6d6a6548..bfddb3884 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,6 +23,7 @@ params { genome = false fasta_index = '' bwa_index = '' + bt2_index = '' //Auxillary input data info and files snpcapture = false @@ -66,10 +67,16 @@ params { mapper = 'bwaaln' bwaalnn = 0.04 bwaalnk = 2 - bwaalnl = 1024 + bwaalnl = 1024 // From Schubert et al. 2012 (10.1186/1471-2164-13-178) circularextension = 500 circulartarget = 'MT' circularfilter = false + bt2_alignmode = 'local' // from Cahill 2018 (10.1093/molbev/msy018) and, Poullet and Orlando (10.3389/fevo.2020.00105) + bt2_sensitivity = 'sensitive' // from Poullet and Orlando (10.3389/fevo.2020.00105) + bt2_trim5 = 0 + bt2_trim3 = 0 + bt2n = 0 // Do not set Cahill 2018 recommendation of 1 here, so not to 'hide' overriding bowtie2 presets + bt2l = 0 //BAM Filtering steps (default = keep mapped and unmapped in BAM file) run_bam_filtering = false