diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8977cd31e..276f8b6d0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,13 +37,13 @@ jobs: - name: Build new docker image if: env.MATCHED_FILES - run: docker build --no-cache . -t nfcore/eager:2.4.4 + run: docker build --no-cache . -t nfcore/eager:2.4.5 - name: Pull docker image if: ${{ !env.MATCHED_FILES }} run: | docker pull nfcore/eager:dev - docker tag nfcore/eager:dev nfcore/eager:2.4.4 + docker tag nfcore/eager:dev nfcore/eager:2.4.5 - name: Install Nextflow env: diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 83a8bc100..47d45298a 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -12,9 +12,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - uses: actions/setup-node@v1 - with: - node-version: '10' + - uses: actions/setup-node@v2 + - name: Install markdownlint run: npm install -g markdownlint-cli - name: Run Markdownlint @@ -46,18 +45,16 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false - YAML: runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 - - uses: actions/setup-node@v1 - with: - node-version: '10' + - uses: actions/setup-node@v2 + - name: Install yaml-lint run: npm install -g yaml-lint - name: Run yaml-lint - run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") + run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") -c .github/yamllint.yml # If the above check failed, post a comment on the PR explaining the failure - name: Post PR comment @@ -84,11 +81,9 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false - nf-core: runs-on: ubuntu-latest steps: - - name: Check out pipeline code uses: actions/checkout@v2 @@ -101,8 +96,8 @@ jobs: - uses: actions/setup-python@v1 with: - python-version: '3.6' - architecture: 'x64' + python-version: "3.6" + architecture: "x64" - name: Install dependencies run: | @@ -129,4 +124,3 @@ jobs: lint_log.txt lint_results.md PR_number.txt - diff --git a/.github/yamllint.yml b/.github/yamllint.yml new file mode 100644 index 000000000..35ebcb16c --- /dev/null +++ b/.github/yamllint.yml @@ -0,0 +1,7 @@ +rules: + document-start: disable + comments: disable + truthy: disable + line-length: disable + empty-lines: disable + diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 000000000..e034a61d5 --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,14 @@ +image: nfcore/gitpod:latest + +vscode: + extensions: # based on nf-core.nf-core-extensionpack + - codezombiech.gitignore # Language support for .gitignore files + # - cssho.vscode-svgviewer # SVG viewer + - esbenp.prettier-vscode # Markdown/CommonMark linting and style checking for Visual Studio Code + - eamodio.gitlens # Quickly glimpse into whom, why, and when a line or code block was changed + - EditorConfig.EditorConfig # override user/workspace settings with settings found in .editorconfig files + - Gruntfuggly.todo-tree # Display TODO and FIXME in a tree view in the activity bar + - mechatroner.rainbow-csv # Highlight columns in csv files in different colors + # - nextflow.nextflow # Nextflow syntax highlighting + - oderwat.indent-rainbow # Highlight indentation level + - streetsidesoftware.code-spell-checker # Spelling checker for source code diff --git a/.nf-core-lint.yml b/.nf-core-lint.yml index 496fea360..a39889277 100644 --- a/.nf-core-lint.yml +++ b/.nf-core-lint.yml @@ -3,4 +3,4 @@ files_unchanged: - 
.github/CONTRIBUTING.md - .github/ISSUE_TEMPLATE/bug_report.md - docs/README.md - + - .github/workflows/linting.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index d31df3f8b..04f1f7208 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,32 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [2.4.5] - 2022-07-39 + +### `Added` + +### `Fixed` + +- [#882](https://github.com/nf-core/eager/pull/882) Define DSL1 execution explicitly, as new versions of Nextflow made DSL2 the default (♥ to & fix from @Lehmann-Fabian) +- [#879](https://github.com/nf-core/eager/issues/879) Add missing threads parameter for pre-clipping FastQC for single-end data, which caused insufficient memory in some cases (♥ to @marcel-keller for reporting) +- [#880](https://github.com/nf-core/eager/issues/880) Fix failure of endorSpy to be cached or re-executed on resume (♥ to @KathrinNaegele, @TCLamnidis, & @mahesh-panchal for reporting and debugging) +- [#885](https://github.com/nf-core/eager/issues/885) Specify task memory for all tools in get_software_versions to account for the incompatibility of Java with some SGE clusters, which caused the process to hang (♥ to @maxibor for reporting) +- [#887](https://github.com/nf-core/eager/issues/887) Clarify what is considered 'ultra-short' reads in the help text of clip_readlength, for when you may wish to turn off length filtering during AdapterRemoval (♥ to @TCLamnidis for reporting) +- [#889](https://github.com/nf-core/eager/issues/889) Remove/update parameters from benchmarking test profiles (♥ to @TCLamnidis for reporting) +- [#895](https://github.com/nf-core/eager/issues/895) Output documentation typo fix and added location of output docs in pipeline summary (♥ to @RodrigoBarquera for reporting) +- [#897](https://github.com/nf-core/eager/issues/897) Fix pipeline crash if no Kraken2 results generated (♥ to @alexandregilardet for reporting) +- [#899](https://github.com/nf-core/eager/issues/899) Fix pipeline crash for circulargenerator if reference file does not end in .fasta (♥ to @scarlhoff for reporting) +- Fixed some missing default values in the Nextflow parameter schema JSON +- [#789](https://github.com/nf-core/eager/issues/789) Substantial speed and memory optimisation of the `extract_map_reads.py` script (♥ to @ivelsko for reporting, @maxibor for optimisation) +- Fix staging of input BAMs for the genotyping_pileupcaller process, a downstream consequence of the changes introduced when fixing endorSpy caching.
+- Made slight correction on metro map diagram regarding input data to SexDeterrmine (only BAM trimming output files) + +### `Dependencies` + +- Updated MultiQC to latest stable alpha version on bioconda, correcting the previously nonsensical AdapterRemoval plots (♥ to @NiemannJ for fixing in MultiQC) + +### `Deprecated` + ## [2.4.4] - 2022-04-08 ### `Added` diff --git a/Dockerfile b/Dockerfile index 30ec3701e..d5bae974c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ COPY environment.yml / RUN conda env create --quiet -f /environment.yml && conda clean -a # Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH /opt/conda/envs/nf-core-eager-2.4.4/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-eager-2.4.5/bin:$PATH # Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-eager-2.4.4 > nf-core-eager-2.4.4.yml \ No newline at end of file +RUN conda env export --name nf-core-eager-2.4.5 > nf-core-eager-2.4.5.yml \ No newline at end of file diff --git a/bin/extract_map_reads.py b/bin/extract_map_reads.py index fac2f9315..7557e3f19 100755 --- a/bin/extract_map_reads.py +++ b/bin/extract_map_reads.py @@ -1,257 +1,145 @@ #!/usr/bin/env python3 -# Written by Maxime Borry and released under the MIT license. +# Written by Maxime Borry and released under the MIT license. # See git repository (https://github.com/nf-core/eager) for full license text. import argparse -import multiprocessing import pysam from xopen import xopen -from functools import partial -import sys -from Bio.SeqIO.QualityIO import FastqGeneralIterator -from io import StringIO +import logging +import os +from pathlib import Path def _get_args(): - '''This function parses and return arguments passed in''' + """This function parses and return arguments passed in""" parser = argparse.ArgumentParser( - prog='extract_mapped_reads', + prog="extract_mapped_reads", formatter_class=argparse.RawDescriptionHelpFormatter, - description="Remove mapped in bam file from fastq files") - parser.add_argument('bam_file', help="path to bam file") - parser.add_argument('fwd', help='path to forward fastq file') + description="Remove mapped in bam file from fastq files", + ) + parser.add_argument("bam_file", help="path to bam file") + parser.add_argument("fwd", help="path to forward fastq file") parser.add_argument( - '-rev', - dest="rev", - default=None, - help="path to reverse fastq file") + "-merged", + dest="merged", + default=False, + action="store_true", + help="specify if bam file was created from merged fastq files", + ) parser.add_argument( - '-of', - dest="out_fwd", - default=None, - help="path to forward output fastq file") + "-rev", dest="rev", default=None, help="path to reverse fastq file" + ) + parser.add_argument( + "-of", dest="out_fwd", default=None, help="path to forward output fastq file" + ) parser.add_argument( - '-or', - dest="out_rev", - default=None, - help="path to forward output fastq file") + "-or", dest="out_rev", default=None, help="path to forward output fastq file" + ) parser.add_argument( - '-m', - dest='mode', - default='remove', - help='Read removal mode: remove reads (remove) or replace sequence by N (replace)' + "-m", + dest="mode", + default="remove", + help="Read removal mode: remove reads (remove) or replace sequence by N (replace). 
Default = remove", ) parser.add_argument( - '-p', - dest='process', - default=4, - help='Number of parallel processes' + "-t", dest="threads", default=4, help="Number of parallel threads" ) args = parser.parse_args() bam = args.bam_file in_fwd = args.fwd + merged = args.merged in_rev = args.rev out_fwd = args.out_fwd out_rev = args.out_rev mode = args.mode - proc = int(args.process) - - return(bam, in_fwd, in_rev, out_fwd, out_rev, mode, proc) - + threads = int(args.threads) -def extract_mapped_chr(chr): - """ - Get mapped reads per chromosome - Args: - chr(str): chromosome - bam(str): bamfile path - Returns: - res(list): list of mapped reads (str) name per chromosome - """ - res = [] - reads = BAMFILE.fetch(chr, multiple_iterators=True) - for read in reads: - if read.is_unmapped == False: - if read.query_name.startswith("M_"): - read_name = read.query_name.replace( - "M_", "").split()[0].split("/")[0] - else: - read_name = read.query_name.split()[0].split("/")[0] - res.append(read_name) - return(res) + return (bam, in_fwd, merged, in_rev, out_fwd, out_rev, mode, threads) -def extract_mapped(proc): +def extract_mapped(bamfile, merged): """Get mapped reads in parallel - Returns: - bamfile(str): path to bam alignment file - result(list): list of mapped reads name (str) - """ - try: - chrs = BAMFILE.references - except ValueError as e: - print(e) - - # Returns empty list if not reads mapped (because not ref match in bam) - if len(chrs) == 0: - return([]) - - # Checking that nb_process is not > nb_chromosomes - proc = min(proc, len(chrs)) - with multiprocessing.Pool(proc) as p: - res = p.map(extract_mapped_chr, chrs) - result = [i for ares in res for i in ares if len(i) > 0] - return(result) - - -def parse_fq(fq): - """Parse a FASTQ file Args: - fq(str): path to fastq file + threads(int): number of threads to use + bam(str): path to bamfile Returns: - fqd(dict): dictionary with read names as keys, seq and quality as values - in a list + bamfile(str): path to bam alignment file + result(set): list of mapped reads name (str) """ - def get_fq_reads(allreads): - read_dict = {} - for title, seq, qual in FastqGeneralIterator(allreads): - # NEED TO ONLY KEEP THE FIRST PART OF THE FASTQ READ NAME FOR CROSS - # REFERENCING WITH BAM FILE: ONLY THIS INFORMATION IS KEPT WHEN - # COLLAPSING READS WITH ADAPTERREMOVAL - - # Until fastq format 1.8 - # Split after slash - # @HWUSI-EAS100R:6:73:941:1973#0/1 - suf_title = "" - title_space = title.split() - if len(title_space) > 1: - title = title_space[0] - suf_title = f" {title_space[1]}" - - # From fastq format 1.8 - # Split after space - # @EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG - title_slash = title.split("/") - if len(title_slash) > 1: - title = title_slash[0] - suf_title = f"/{title_slash[1]}" - - read_dict[title] = [suf_title, seq, "+", qual] - return(read_dict) - - if fq.endswith('.gz'): - with xopen(fq, 'r') as allreads: - fqd = get_fq_reads(allreads) + if bamfile.endswith(".bam") or bamfile.endswith(".gz"): + read_mode = "rb" else: - with open(fq, 'r') as allreads: - fqd = get_fq_reads(allreads) - - return(fqd) + read_mode = "r" + mapped_reads = set() + bamfile = pysam.AlignmentFile(bamfile, mode=read_mode) + for read in bamfile.fetch(): + if read.flag != 4: + if merged: + if read.query_name.startswith("M_"): + mapped_reads.add(read.query_name[2:]) + elif read.query_name.startswith("MT_"): + mapped_reads.add(read.query_name[3:]) + else: + mapped_reads.add(read.query_name) + else: + mapped_reads.add(read.query_name) + return mapped_reads 
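# --- Editor's illustrative sketch, not part of the patch: how the two rewritten pieces fit together. ---
# extract_mapped() walks the (indexed) BAM once and collects the names of aligned reads into a set;
# read_write_fq() (defined below) then streams each FASTQ with pysam.FastxFile and, depending on -m,
# either drops or N-masks any read whose name is in that set. A hypothetical stand-alone use, with
# placeholder file names, would look like:
#
#     mapped = extract_mapped("sample.mapped.bam", merged=True)
#     read_write_fq(fq_in="sample_R1.fastq.gz", fq_out="sample.r1.fq.gz",
#                   mapped_reads=mapped, mode="remove", write_mode="wb", proc=4)
#
# or, from the shell, mirroring how main.nf calls the script (again with placeholder names):
#
#     extract_map_reads.py sample.bam sample_R1.fastq.gz -rev sample_R2.fastq.gz \
#         -merged -m remove -of sample.r1.fq.gz -or sample.r2.fq.gz -t 4
#
# In the pipeline itself this wiring is done by the __main__ block further down in this file.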
-def get_mapped_reads(fq_dict, mapped_reads): - """Sort mapped reads from dictionary of fastq reads - Args: - fq_dict(dict) dictionary with read names as keys, seq and quality as values - in a list - mapped_reads(list) list of mapped reads - Returns: - fqd(dict) dictionary with read names as key, unmapped/mapped (u|m), - seq and quality as values in a list +def read_write_fq(fq_in, fq_out, mapped_reads, mode, write_mode, proc): """ - - def intersection(list1, list2): - return(set(list1).intersection(list2)) - - def difference(list1, list2): - return(set(list1).difference(list2)) - - fqd = {} - all_reads = list(fq_dict.keys()) - mapped = intersection(all_reads, mapped_reads) - unmapped = difference(all_reads, mapped_reads) - - for rm in mapped: - fqd[rm] = ['m']+fq_dict[rm] - for ru in unmapped: - fqd[ru] = ['u']+fq_dict[ru] - - return(fqd) - - -def write_fq(fq_dict, fname, write_mode, remove_mode, proc): - """Write to fastq file + Read and write fastq file with mapped reads removed Args: - fq_dict(dict): fq_dict with unmapped read names as keys, - unmapped/mapped (u|m), seq, and quality as values in a list - fname(string) Path to output fastq file - write_mode (str): 'rb' or 'r' - remove_mode (str): remove (remove read) or replace (replace read sequence) by Ns - proc(int) number of processes + fq_in(str): path to input fastq file + fq_out(str): path to output fastq file + mapped_reads(set): set of mapped reads name (str) + mode(str): read removal mode (remove or replace) + write_mode(str): write mode (w or wb) + proc(int): number of parallel processes + merged(bool): True if bam file was created from merged fastq files """ - fq_dict_keys = list(fq_dict.keys()) - if write_mode == 'wb': - with xopen(fname, mode='wb', threads=proc) as fw: - for fq_dict_key in fq_dict_keys: - wstring = "" - if remove_mode == 'remove': - if fq_dict[fq_dict_key][0] == 'u': - wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" - for i in fq_dict[fq_dict_key][2:]: - wstring += f"{i}\n" - elif remove_mode == 'replace': - # if unmapped, write all the read lines - if fq_dict[fq_dict_key][0] == 'u': - wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" - for i in fq_dict[fq_dict_key][2:]: - wstring += f"{i}\n" - # if mapped, write all the read lines, but replace sequence - # by N*(len(sequence)) - elif fq_dict[fq_dict_key][0] == 'm': - wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" - wstring += f"{'N'*len(fq_dict[fq_dict_key][2])}\n" - for i in fq_dict[fq_dict_key][3:]: - wstring += f"{i}\n" - fw.write(wstring.encode()) - else: - with open(fname, 'w') as fw: - for fq_dict_key in fq_dict_keys: - wstring = "" - if remove_mode == 'remove': - if fq_dict[fq_dict_key][0] == 'u': - wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" - for i in fq_dict[fq_dict_key][2:]: - wstring += f"{i}\n" - elif remove_mode == 'replace': - # if unmapped, write all the read lines - if fq_dict[fq_dict_key][0] == 'u': - wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" - for i in fq_dict[fq_dict_key][2:]: - wstring += f"{i}\n" - # if mapped, write all the read lines, but replace sequence - # by N*(len(sequence)) - elif fq_dict[fq_dict_key][0] == 'm': - wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" - wstring += f"{'N'*len(fq_dict[fq_dict_key][2])}\n" - for i in fq_dict[fq_dict_key][3:]: - wstring += f"{i}\n" - fw.write(wstring) - + if write_mode == "w": + cm = open(fq_out, write_mode) + elif write_mode == "wb": + cm = xopen(fq_out, mode=write_mode, threads=proc) + with pysam.FastxFile(fq_in) as fh: + with cm as 
fh_out: + for read in fh: + try: + if read.name in mapped_reads: + if mode == "replace": + read.sequence = "N" * len(read.sequence) + read = str(read) + "\n" + if write_mode == "w": + fh_out.write(read) + elif write_mode == "wb": + fh_out.write(read.encode()) + else: + read = str(read) + "\n" + if write_mode == "w": + fh_out.write(read) + elif write_mode == "wb": + fh_out.write(read.encode()) + except Exception as e: + logging.error(f"Problem with {str(read)}") + logging.error(e) def check_remove_mode(mode): - if mode.lower() not in ['replace', 'remove']: - print(f"Mode must be {' or '.join(mode)}") - return(mode.lower()) + if mode.lower() not in ["replace", "remove"]: + logging.info(f"Mode must be {' or '.join(mode)}") + return mode.lower() if __name__ == "__main__": - BAM, IN_FWD, IN_REV, OUT_FWD, OUT_REV, MODE, PROC = _get_args() + BAM, IN_FWD, MERGED, IN_REV, OUT_FWD, OUT_REV, MODE, PROC = _get_args() + + logging.basicConfig(level=logging.INFO, format="%(message)s") if OUT_FWD == None: - out_fwd = f"{IN_FWD.split('/')[-1].split('.')[0]}.r1.fq.gz" + out_fwd = os.path.join(os.getcwd(), Path(IN_FWD).stem + ".r1.fq.gz") else: out_fwd = OUT_FWD @@ -261,30 +149,34 @@ def check_remove_mode(mode): write_mode = "w" remove_mode = check_remove_mode(MODE) - BAMFILE = pysam.AlignmentFile(BAM, 'r') - # FORWARD OR SE FILE - print(f"- Extracting mapped reads from {BAM}") - mapped_reads = extract_mapped(proc=PROC) - print(f"- Parsing forward fq file {IN_FWD}") - fqd_fwd = parse_fq(IN_FWD) - print("- Cross referencing mapped reads in forward fq") - fq_dict_fwd = get_mapped_reads(fqd_fwd, mapped_reads) - # print(fq_dict_fwd) - print(f"- Writing forward fq to {out_fwd}") - write_fq(fq_dict=fq_dict_fwd, fname=out_fwd, - write_mode=write_mode, remove_mode=remove_mode, proc=PROC) + # FORWARD OR SE FILE + logging.info(f"- Extracting mapped reads from {BAM}") + mapped_reads = extract_mapped(BAM, merged=MERGED) + logging.info(f"- Checking forward fq file {IN_FWD}") + read_write_fq( + fq_in=IN_FWD, + fq_out=out_fwd, + mapped_reads=mapped_reads, + mode=remove_mode, + write_mode=write_mode, + proc=PROC, + ) + logging.info(f"- Cleaned forward FastQ file written to {out_fwd}") # REVERSE FILE if IN_REV: if OUT_REV == None: - out_rev = f"{IN_REV.split('/')[-1].split('.')[0]}.r2.fq.gz" + out_rev = os.path.join(os.getcwd(), Path(IN_REV).stem + ".r2.fq.gz") else: out_rev = OUT_REV - print(f"- Parsing reverse fq file {IN_REV}") - fqd_rev = parse_fq(IN_REV) - print("- Cross referencing mapped reads in reverse fq") - fq_dict_rev = get_mapped_reads(fqd_rev, mapped_reads) - print(f"- Writing reverse fq to {out_rev}") - write_fq(fq_dict=fq_dict_rev, fname=out_rev, - write_mode=write_mode, remove_mode=remove_mode, proc=PROC) + logging.info(f"- Checking reverse fq file {IN_FWD}") + read_write_fq( + fq_in=IN_REV, + fq_out=out_rev, + mapped_reads=mapped_reads, + mode=remove_mode, + write_mode=write_mode, + proc=PROC, + ) + logging.info(f"- Cleaned reverse FastQ file written to {out_rev}") diff --git a/conf/benchmarking_human.config b/conf/benchmarking_human.config index dcd4a55ec..b8cbce224 100644 --- a/conf/benchmarking_human.config +++ b/conf/benchmarking_human.config @@ -12,26 +12,24 @@ params { config_profile_description = "A 'fullsized' benchmarking profile for deepish Human sequencing aDNA data" //Input data - input = 'https://raw.githubusercontent.com/jfy133/test-datasets/eager/testdata/Benchmarking/benchmarking_human.tsv' + input = 
'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Benchmarking/benchmarking_human.tsv' // Genome reference fasta = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz' run_bam_filtering = true - bam_discard_unmapped = true bam_unmapped_type = 'discard' bam_mapping_quality_threshold = 30 dedupper = 'markduplicates' run_trim_bam = true - bamutils_clip_left = 1 - bamutils_clip_right = 1 + bamutils_clip_double_stranded_none_udg_left = 1 + bamutils_clip_double_stranded_none_udg_right = 1 // JAR will need to be downloaded first! run_genotyping = true genotyping_tool = 'ug' genotyping_source = 'trimmed' - gatk_ug_jar = 'GenomeAnalysisTK.jar' gatk_call_conf = 20 run_sexdeterrmine = true @@ -41,8 +39,6 @@ params { contamination_chrom_name = 'chrX' run_mtnucratio = true - - } process { diff --git a/conf/benchmarking_vikingfish.config b/conf/benchmarking_vikingfish.config index 765cf1f4d..b7ec39b56 100644 --- a/conf/benchmarking_vikingfish.config +++ b/conf/benchmarking_vikingfish.config @@ -20,7 +20,6 @@ params { bwaalnl = 1024 run_bam_filtering = true - bam_discard_unmapped = true bam_unmapped_type = 'discard' bam_mapping_quality_threshold = 25 diff --git a/docs/images/usage/eager2_metromap_complex.png b/docs/images/usage/eager2_metromap_complex.png index e0bdc3958..3df2e1514 100644 Binary files a/docs/images/usage/eager2_metromap_complex.png and b/docs/images/usage/eager2_metromap_complex.png differ diff --git a/docs/images/usage/eager2_metromap_complex.svg b/docs/images/usage/eager2_metromap_complex.svg index b01d78fe1..1560cf8a0 100644 --- a/docs/images/usage/eager2_metromap_complex.svg +++ b/docs/images/usage/eager2_metromap_complex.svg @@ -5,7 +5,7 @@ viewBox="0 0 316.84646 170.04657" version="1.1" id="svg8" - inkscape:version="1.1.1 (1:1.1+202109281949+c3084ef5ed)" + inkscape:version="1.2 (1:1.2.1+202207142221+cd75a1ee6d)" sodipodi:docname="eager2_metromap_complex.svg" inkscape:export-filename="eager2_metromap_complex.png" inkscape:export-xdpi="300" @@ -145,19 +145,19 @@ borderopacity="1.0" inkscape:pageopacity="0.0" inkscape:pageshadow="2" - inkscape:zoom="0.55881334" - inkscape:cx="695.2232" - inkscape:cy="571.74726" + inkscape:zoom="1.1176267" + inkscape:cx="624.98508" + inkscape:cy="294.37379" inkscape:document-units="mm" inkscape:current-layer="layer1" inkscape:document-rotation="0" - showgrid="false" + showgrid="true" inkscape:snap-bbox="true" inkscape:bbox-nodes="true" inkscape:window-width="1920" - inkscape:window-height="1016" + inkscape:window-height="1043" inkscape:window-x="0" - inkscape:window-y="27" + inkscape:window-y="120" inkscape:window-maximized="1" inkscape:snap-bbox-midpoints="false" inkscape:snap-global="true" @@ -166,7 +166,9 @@ fit-margin-left="0" fit-margin-right="0" fit-margin-bottom="0" - inkscape:pagecheckerboard="0"> + inkscape:pagecheckerboard="0" + inkscape:showpageshadow="2" + inkscape:deskcolor="#d1d1d1"> (Human) + (trimmed only) **NB:** The FastQC (pre-Trimming) plots displayed in the MultiQC report shows *untrimmed* reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-Trimming) section. You should expect after AdapterRemoval, that most of the artefacts are removed. +> **NB:** The FastQC (pre-Trimming) plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. 
To see how your reads look after trimming, look at the FastQC reports in the FastQC (post-Trimming) section. You should expect after AdapterRemoval, that most of the artefacts are removed. > :warning: If you turned on `--post_ar_fastq_trimming` your 'post-Trimming' report the statistics _after_ this trimming. There is no separate report for the post-AdapterRemoval trimming. #### Sequence Counts @@ -284,7 +284,7 @@ You will receive output for each FASTQ file supplied for single end data, or for These stacked bars plots are unfortunately a little confusing, when displayed in MultiQC. However are relatively straight-forward once you understand each category. They can be displayed as counts of reads per AdapterRemoval read-category, or as percentages of the same values. Each forward(/reverse) file combination are displayed once. -The most important value is the **Retained Read Pairs** which gives you the final number of reads output into the file that goes into mapping. Note, however, this section of the stack bar *includes* the other categories displayed (see below) in the calculation. +The most important value is the **Retained Read Pairs** which gives you the final number of reads output into the file that goes into mapping. Note, however, this section of the stack bar _includes_ the other categories displayed (see below) in the calculation. Other Categories: @@ -323,7 +323,7 @@ With paired-end ancient DNA sequencing runs You expect to see a slight increase This module provides information on mapping when running the Bowtie2 aligner. Bowtie2, like bwa, takes raw FASTQ reads and finds the most likely place on the reference genome it derived from. While this module is somewhat redundant with the [Samtools](#samtools) (which reports mapping statistics for bwa) and the endorSp.y endogenous DNA value in the general statistics table, it does provide some details that could be useful in certain contexts. -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. #### Single/Paired-end alignments @@ -343,7 +343,7 @@ The main additional useful information compared to [Samtools](#samtools) is that MALT is a metagenomic aligner (equivalent to BLAST, but much faster). It produces direct alignments of sequencing reads in a reference genome. It is often used for metagenomic profiling or pathogen screening, and specifically in nf-core/eager, of off-target reads from genome mapping. -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. 
#### Metagenomic Mappability @@ -378,7 +378,7 @@ Kraken is another metagenomic classifier, but takes a different approach to alig It is useful when you do not have large computing power or you want very rapid but rough approximation of the metagenomic profile of your sample. -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes and sequencing configurations in one value. #### Top Taxa @@ -396,7 +396,7 @@ However for screening for specific metagenomic profiles, such as ancient microbi This module provides numbers in raw counts of the mapping of your DNA reads to your reference genome. -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes in one value. #### Flagstat Plot @@ -416,7 +416,7 @@ The remaining rows will be 0 when running `bwa aln` as these characteristics of ### DeDup -You will receive output for each *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. +You will receive output for each _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. #### Background @@ -476,7 +476,7 @@ There are two algorithms from the tools we use: `c_curve` and `lc_extrap`. The f Due to endogenous DNA being so low when doing initial screening, the maths behind `lc_extrap` often fails as there is not enough data. Therefore nf-core/eager sticks with `c_curve` which gives a similar approximation of the library complexity, but is more robust to smaller datasets. -You will receive output for each deduplicated *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. +You will receive output for each deduplicated _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. #### Complexity Curve @@ -506,7 +506,7 @@ Therefore, three main characteristics of ancient DNA are: * Elevated G and As (purines) just before strand breaks * Increased C and Ts at ends of fragments -You will receive output for each deduplicated *library*. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. 
+You will receive output for each deduplicated _library_. This means that if you use TSV input and have one library sequenced over multiple lanes and sequencing types, these are merged and you will get mapping statistics of all lanes of the library in one value. #### Misincorporation Plots @@ -547,7 +547,7 @@ Qualimap is a tool which provides statistics on the quality of the mapping of yo Note that many of the statistics from this module are displayed in the General Stats table (see above), as they represent single values that are not plottable. -You will receive output for each *sample*. This means you will statistics of deduplicated values of all types of libraries combined in a single value (i.e. non-UDG treated, full-UDG, paired-end, single-end all together). +You will receive output for each _sample_. This means you will statistics of deduplicated values of all types of libraries combined in a single value (i.e. non-UDG treated, full-UDG, paired-end, single-end all together). :warning: If your library has no reads mapping to the reference, this will result in an empty BAM file. Qualimap will therefore not produce any output even if a BAM exists! @@ -670,7 +670,7 @@ If you ran with `--min_allele_freq_hom` and `--min_allele_freq_het` set to the s ## Output Files -This section gives a brief summary of where to look for what files for downstream analysis. This covers *all* modules. +This section gives a brief summary of where to look for what files for downstream analysis. This covers _all_ modules. Each module has it's own output directory which sit alongside the `MultiQC/` directory from which you opened the report. @@ -679,7 +679,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir * `fastqc/`: this contains the original per-FASTQ FastQC reports that are summarised with MultiQC. These occur in both `html` (the report) and `.zip` format (raw data). The `after_clipping` folder contains the same but for after AdapterRemoval. * `adapterremoval/`: this contains the log files (ending with `.settings`) with raw trimming (and merging) statistics after AdapterRemoval. In the `output` sub-directory, are the output trimmed (and merged) `fastq` files. These you can use for downstream applications such as taxonomic binning for metagenomic studies. * `post_ar_fastq_trimmed`: this contains `fastq` files that have been additionally trimmed after AdapterRemoval (if turned on). These reads are usually that had internal barcodes, or damage that needed to be removed before mapping. -* `mapping/`: this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). You will also find a corresponding BAM index file (ending in `.csi` or `.bam`), and if running the `bowtie2` mapper: a log ending in `_bt2.log`. You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in nf-core/eager (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!). +* `mapping/`: this contains a sub-directory corresponding to the mapping tool you used, inside of which will be the initial BAM files containing the reads that mapped to your reference genome with no modification (see below). 
You will also find a corresponding BAM index file (ending in `.csi` or `.bai`), and if running the `bowtie2` mapper: a log ending in `_bt2.log`. You can use these for downstream applications e.g. if you wish to use a different de-duplication tool not included in nf-core/eager (although please feel free to add a new module request on the Github repository's [issue page](https://github.com/nf-core/eager/issues)!). * `samtools/`: this contains two sub-directories. `stats/` contain the raw mapping statistics files (ending in `.stats`) from directly after mapping. `filter/` contains BAM files that have had a mapping quality filter applied (set by the `--bam_mapping_quality_threshold` flag) and a corresponding index file. Furthermore, if you selected `--bam_discard_unmapped`, you will find your separate file with only unmapped reads in the format you selected. Note unmapped read BAM files will _not_ have an index file. * `deduplication/`: this contains a sub-directory called `dedup/`, inside here are sample specific directories. Each directory contains a BAM file containing mapped reads but with PCR duplicates removed, a corresponding index file and two stats file. `.hist.` contains raw data for a deduplication histogram used for tools like preseq (see below), and the `.log` contains overall summary deduplication statistics. * `endorSpy/`: this contains all JSON files exported from the endorSpy endogenous DNA calculation tool. The JSON files are generated specifically for display in the MultiQC general statistics table and is otherwise very likely not useful for you. @@ -697,10 +697,11 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir * `metagenomic_complexity_filter`: this contains the output from filtering of input reads to metagenomic classification of low-sequence complexity reads as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `._bbduk.stats` files to get summary statistics of the filtering. * `metagenomic_classification/`: this contains the output for a given metagenomic classifier. * Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested. - * Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check for breadth of coverage and detect read stacking. A small number of aligned reads (low coverage) and a kmer duplication >1 is usually a sign of read stacking, usually indicative of a false positive hit (e.g. from over-amplified libraries). *Kmer duplication is defined as: number of kmers / number of unique kmers*. You will find two kraken reports formats available: + * Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. 
You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check for breadth of coverage and detect read stacking. A small number of aligned reads (low coverage) and a kmer duplication >1 is usually a sign of read stacking, usually indicative of a false positive hit (e.g. from over-amplified libraries). _Kmer duplication is defined as: number of kmers / number of unique kmers_. You will find two kraken reports formats available: * the `*.kreport` which is the old report format, without distinct minimizer count information, used by some tools such as [Pavian](https://github.com/fbreitwieser/pavian) * the `*.kraken2_report` which is the new kraken report format, with the distinct minimizer count information. * finally, the `*.kraken.out` file are the direct output of Kraken2 + * ⚠️ If your sample has no hits, no kraken output files will be created for that sample! * `maltextract/`: this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA) * `consensus_sequence/`: this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively. `merged_bams/initial`: these contain the BAM files that would go into UDG-treatment specific BAM trimming. All libraries of the sample sample, **and** same UDG-treatment type will be in these BAM files. diff --git a/docs/usage.md b/docs/usage.md index 454b10a93..133f986fc 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -115,7 +115,7 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof * A profile with a complete configuration for automated testing * Includes links to test data so needs no other parameters -> *Important*: If running nf-core/eager on a cluster - ask your system +> _Important_: If running nf-core/eager on a cluster - ask your system > administrator what profile to use. **Institution Specific Profiles** These are profiles specific to certain **HPC @@ -349,7 +349,7 @@ Note the following important points and limitations for setting up: * The TSV must use actual tabs (not spaces) between cells. * The input FASTQ filenames are discarded after FastQC, all other downstream results files are based on `Sample_Name`, `Library_ID` and `Lane` columns for filenames. -* *File* names must be unique regardless of file path, due to risk of over-writing (see: [https://github.com/nextflow-io/nextflow/issues/470](https://github.com/nextflow-io/nextflow/issues/470)). +* _File_ names must be unique regardless of file path, due to risk of over-writing (see: [https://github.com/nextflow-io/nextflow/issues/470](https://github.com/nextflow-io/nextflow/issues/470)). 
* At different stages of the merging process, (as above) nf-core/eager will use as output filenames the information from the `Sample_Name`, `Library_ID` and/or `Lane` columns for filenames. * Library_IDs must be unique (other than if they are spread across multiple lanes). For example, your .tsv file must not have rows with both the strings in the Library_ID column as `Library1` and `Library1`, for **both** `SampleA` and `SampleB` in the Sample_ID column, otherwise the two `Library1.fq.gz` files may result in a filename collision. * If it is 'too late' and you already have duplicated FASTQ file names before starting a run, a workaround is to concatenate the FASTQ files together and supply this to a nf-core/eager run. The only downside is that you will not get independent FASTQC results for each file. @@ -531,7 +531,7 @@ and investigate the log and error messages that are produced by each command of the process. For example, in the error in -[1a](#1a-Nextflow-reports-an-error-executing-process-with-command-error) you can +[1a](#1a-nextflow-reports-an-error-executing-process-with-command-error) you can see the following line ```bash @@ -586,7 +586,7 @@ the #eager channel). #### Tutorial Profiles - Background -A useful feature of Nextflow is the ability to use configuration *profiles* that +A useful feature of Nextflow is the ability to use configuration _profiles_ that can specify many default parameters and other settings on how to run your pipeline. @@ -617,7 +617,7 @@ levels in terms of memory usage, pipeline-level profiles can also assist in facilitating reproducible science by giving a way for researchers to 'publish' their exact pipeline parameters in way other users can automatically re-run the pipeline with the pipeline parameters used in the original publication but on -their *own* cluster. +their _own_ cluster. To illustrate this, lets say we analysed our data on a HPC called 'blue' for which an institutional profile already exists, and for our analysis we defined a @@ -689,7 +689,7 @@ defined in the `cluster` profile. > institutional-level profiles. Otherwise please skip to [Writing your own profile](#tutorial-profiles---writing-your-own-profile) In actuality, a nf-core/eager run already contains many configs and profiles, -and will normally use *multiple* configs profiles in a single run. Multiple +and will normally use _multiple_ configs profiles in a single run. Multiple configuration and profiles files can be used, and each new one selected will inherit all the previous one's parameters, and the parameters in the new one will then overwrite any that have been changed from the original. @@ -727,7 +727,7 @@ nextflow run nf-core/eager -c old_dna_profile.config -profile hpc_blue,old_dna < In the background, any parameters in the pipeline's `nextflow.config` (containing default parameters) will be overwritten by the -`old_dna_profile.config`. In addition, the `old_dna` *profile* will overwrite +`old_dna_profile.config`. In addition, the `old_dna` _profile_ will overwrite any parameters set in the config but outside the profile definition of `old_dna_profile.config`. @@ -764,7 +764,7 @@ if your run does not use the parameters you expect. 
> specifying a custom `.config` file by using `-C` (capital C) instead of `-c` > (which inherits previously specify parameters) -Another thing that is important to note is that if a specific *profile* is +Another thing that is important to note is that if a specific _profile_ is specified in `nextflow run`, this replaces any 'global' parameter that is specified within the config file (but outside a profile) itself - **regardless** of profile order (see above). @@ -1447,7 +1447,7 @@ signal drop or want to log off, Nextflow will not crash. #### Tutorial Human Pop-Gen - Results Assuming the run completed without any crashes (if problems do occur, check -against [#usage](#pipeline-options) that all parameters are as expected, or +against [parameters](https://nf-core/eager/parameters) that all parameters are as expected, or check the [FAQ](#troubleshooting-and-faqs)), we can now check our results in `results/`. @@ -1703,7 +1703,7 @@ each `Lane`, but the `Sample_Name` and `Library_ID` columns identify and group them together accordingly. Secondly, as we have NextSeq data, we have specified we have `2` for `Colour_Chemistry`, which is important for downstream processing (see below). The other columns are less important for this particular context of -metagenomic screening. See the nf-core/eager [usage](#pipeline-options) +metagenomic screening. See the nf-core/eager [parameters](https://nf-core/eager/parameters) documentation for more specifications on how to set up a TSV file (e.g. why despite NextSeqs only having 4 lanes, we go up to 8 in the example above). @@ -1806,7 +1806,7 @@ nextflow run nf-core/eager \ nf-core/eager will now take all unmapped reads after mapping and convert the BAM file back to FASTQ, which can be accepted by MALT. But of course, we also then need to tell nf-core/eager we actually want to run MALT. We will also specify -the location of the [pre-built database](#preparation) and which 'min support' +the location of the [pre-built database](#tutorial-metagenomics---preparation) and which 'min support' method we want to use (this specifies the minimum number of alignments is needed to a particular taxonomic node to be 'kept' in the MALT output files). Otherwise we will keep all other parameters as default. For example using BlastN mode, @@ -1878,7 +1878,7 @@ Porphyromonas ``` We have also specified the path to the HOPS resources [downloaded -earlier](#preparation), and that I want to turn off 'destacking' (removal of any +earlier](#tutorial-metagenomics---preparation), and that I want to turn off 'destacking' (removal of any read that overlaps the positions of another - something only recommended to keep on when you have high coverage data). @@ -1889,7 +1889,7 @@ signal drop or want to log off, Nextflow will not crash. #### Tutorial Metagenomics - Results Assuming the run completed without any crashes (if problems do occur, check -against [usage](#pipeline-options) that all parameters are as expected, or check +against [parameters](https://nf-core/eager/parameters) that all parameters are as expected, or check the [FAQ](#troubleshooting-and-faqs)), we can now check our results in `results/`. @@ -2515,7 +2515,7 @@ signal drop or want to log off, Nextflow will not crash. 
#### Tutorial Pathogen Genomics - Results Assuming the run completed without any crashes (if problems do occur, check -against [#usage](#pipeline-options) that all parameters are as expected, or +against [parameters](https://nf-core/eager/parameters) that all parameters are as expected, or check the [FAQ](#troubleshooting-and-faqs)), we can now check our results in `results/`. diff --git a/environment.yml b/environment.yml index 05acf03b2..0676e6578 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-eager-2.4.4 +name: nf-core-eager-2.4.5 channels: - conda-forge - bioconda @@ -26,7 +26,7 @@ dependencies: - bioconda::qualimap=2.2.2d - bioconda::vcf2genome=0.91 - bioconda::damageprofiler=0.4.9 # Don't upgrade - later versions don't allow java 8 - - bioconda::multiqc=1.12 + - bioconda::multiqc=1.13a - bioconda::pmdtools=0.60 - bioconda::bedtools=2.30.0 - conda-forge::libiconv=1.16 diff --git a/main.nf b/main.nf index f5d38fe45..b91bc31a2 100644 --- a/main.nf +++ b/main.nf @@ -10,6 +10,7 @@ For a list of authors and contributors, see: https://github.com/nf-core/eager/tree/dev#authors-alphabetical ------------------------------------------------------------------------------------------------------------ */ +nextflow.enable.dsl=1 log.info Headers.nf_core(workflow, params.monochrome_logs) @@ -702,7 +703,7 @@ process fastqc { """ } else { """ - fastqc -q $r1 + fastqc -t ${task.cpus} -q $r1 rename 's/_fastqc\\.zip\$/_raw_fastqc.zip/' *_fastqc.zip rename 's/_fastqc\\.html\$/_raw_fastqc.html/' *_fastqc.html """ @@ -768,7 +769,7 @@ ch_input_for_fastp.fourcol [ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ] } - .set { ch_skipfastp_for_merge } + .set { ch_skipfastp_for_merge } ch_output_from_fastp .map{ @@ -799,7 +800,7 @@ process adapter_removal { input: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(r1), file(r2) from ch_fastp_for_adapterremoval - path adapterlist from ch_adapterlist.collect().dump(tag: "Adapter list") + path adapterlist from ch_adapterlist.collect() output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("output/*{combined.fq,.se.truncated,pair1.truncated}.gz") into ch_output_from_adapterremoval_r1 @@ -967,13 +968,10 @@ if ( params.skip_collapse ){ // AdapterRemoval bypass when not running it if (!params.skip_adapterremoval) { ch_output_from_adapterremoval.mix(ch_fastp_for_skipadapterremoval) - .dump(tag: "post_ar_adapterremoval_decision_skipar") .filter { it =~/.*combined.fq.gz|.*truncated.gz/ } - .dump(tag: "ar_bypass") .into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; } } else { ch_fastp_for_skipadapterremoval - .dump(tag: "post_ar_adapterremoval_decision_withar") .into { ch_adapterremoval_for_post_ar_trimming; ch_adapterremoval_for_skip_post_ar_trimming; } } @@ -1075,7 +1073,6 @@ ch_branched_for_lanemerge = ch_inlinebarcoderemoval_for_lanemerge [ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ] } - .dump(tag: "lanemerge_bypass_decision") .branch { skip_merge: it[7].size() == 1 // Can skip merging if only single lanes merge_me: it[7].size() > 1 @@ -1096,7 +1093,6 @@ ch_branched_for_lanemerge_skipme = ch_branched_for_lanemerge.skip_merge [ samplename, libraryid, lane, seqtype, organism, strandedness, udg, r1, r2 ] } - .dump(tag: "lanemerge_reconfigure") ch_branched_for_lanemerge_ready 
= ch_branched_for_lanemerge.merge_me @@ -1124,7 +1120,7 @@ process lanemerge { publishDir "${params.outdir}/lanemerging", mode: params.publish_dir_mode input: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_branched_for_lanemerge_ready.dump(tag: "lange_merge_input") + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_branched_for_lanemerge_ready output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_R1_lanemerged.fq.gz") into ch_lanemerge_for_mapping_r1 @@ -1132,7 +1128,7 @@ process lanemerge { script: if ( seqtype == 'PE' && ( params.skip_collapse || params.skip_adapterremoval ) ){ - lane = 0 + def lane = 0 """ cat ${r1} > "${libraryid}"_R1_lanemerged.fq.gz cat ${r2} > "${libraryid}"_R2_lanemerged.fq.gz @@ -1148,7 +1144,6 @@ process lanemerge { // Ensuring always valid R2 file even if doesn't exist for AWS if ( ( params.skip_collapse || params.skip_adapterremoval ) ) { ch_lanemerge_for_mapping_r1 - .dump(tag: "post_lanemerge_reconfigure") .mix(ch_lanemerge_for_mapping_r2) .groupTuple(by: [0,1,2,3,4,5,6]) .map{ @@ -1263,8 +1258,8 @@ process bwa { publishDir "${params.outdir}/mapping/bwa", mode: params.publish_dir_mode input: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_lanemerge_for_bwa.dump(tag: "bwa_input_reads") - path index from bwa_index.collect().dump(tag: "input_index") + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(r1), path(r2) from ch_lanemerge_for_bwa + path index from bwa_index.collect() output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.mapped.bam"), path("*.{bai,csi}") into ch_output_from_bwa @@ -1342,7 +1337,6 @@ process circulargenerator{ else null } - input: file fasta from ch_fasta_for_circulargenerator @@ -1354,7 +1348,7 @@ process circulargenerator{ params.mapper == 'circularmapper' script: - prefix = "${fasta.baseName}_${params.circularextension}.fasta" + prefix = "${fasta.baseName}_${params.circularextension}.${fasta.extension}" """ circulargenerator -Xmx${task.memory.toGiga()}g -e ${params.circularextension} -i $fasta -s ${params.circulartarget} bwa index $prefix @@ -1381,7 +1375,7 @@ process circularmapper{ script: def filter = params.circularfilter ? '-f true -x true' : '' - def elongated_root = "${fasta.baseName}_${params.circularextension}.fasta" + def elongated_root = "${fasta.baseName}_${params.circularextension}.${fasta.extension}" def size = params.large_ref ? '-c' : '' if (!params.single_end && params.skip_collapse ){ @@ -1537,18 +1531,19 @@ process hostremoval_input_fastq { tuple samplename, libraryid, seqtype, organism, strandedness, udg, file("*.fq.gz") into ch_output_from_hostremovalfastq script: + def merged = params.skip_collapse ? 
"": "-merged" if ( seqtype == 'SE' ) { out_fwd = bam.baseName+'.hostremoved.fq.gz' """ samtools index $bam - extract_map_reads.py $bam ${r1} -m ${params.hostremoval_mode} -of $out_fwd -p ${task.cpus} + extract_map_reads.py $bam ${r1} -m ${params.hostremoval_mode} $merged -of $out_fwd -t ${task.cpus} """ } else { out_fwd = bam.baseName+'.hostremoved.fwd.fq.gz' out_rev = bam.baseName+'.hostremoved.rev.fq.gz' """ samtools index $bam - extract_map_reads.py $bam ${r1} -rev ${r2} -m ${params.hostremoval_mode} -of $out_fwd -or $out_rev -p ${task.cpus} + extract_map_reads.py $bam ${r1} -rev ${r2} -m ${params.hostremoval_mode} $merged -of $out_fwd -or $out_rev -t ${task.cpus} """ } @@ -1563,7 +1558,7 @@ ch_branched_for_seqtypemerge = ch_mapping_for_seqtype_merging it -> def samplename = it[0] def libraryid = it[1] - def lane = it[2] + def lane = 0 def seqtype = it[3].unique() // How to deal with this? def organism = it[4] def strandedness = it[5] @@ -1571,9 +1566,13 @@ ch_branched_for_seqtypemerge = ch_mapping_for_seqtype_merging def r1 = it[7] def r2 = it[8] - // We will assume if mixing it is better to set as PE as this is informative + // 1. We will assume if mixing it is better to set as PE as this is informative // for DeDup (and markduplicates doesn't care), but will throw a warning! - def seqtype_new = seqtype.flatten().size() > 1 ? 'PE' : seqtype + // 2. We will also flatten to a single value to address problems with 'unstable' + // Nextflow ArrayBag object types not allowing the .join to work between resumes + // See: https://github.com/nf-core/eager/issues/880 + + def seqtype_new = seqtype.flatten().size() > 1 ? 'PE' : seqtype.flatten()[0] if ( seqtype.flatten().size() > 1 && params.dedupper == 'dedup' ) { log.warn "[nf-core/eager] Warning: you are running DeDup on BAMs with a mixture of PE/SE data for library: ${libraryid}. DeDup is designed for PE data only, deduplication maybe suboptimal!" 
@@ -1582,7 +1581,6 @@ ch_branched_for_seqtypemerge = ch_mapping_for_seqtype_merging [ samplename, libraryid, lane, seqtype_new, organism, strandedness, udg, r1, r2 ] } - .dump(tag: "pre_seqtype_decision") .branch { skip_merge: it[7].size() == 1 // Can skip merging if only single lanes merge_me: it[7].size() > 1 @@ -1790,11 +1788,12 @@ if (params.run_bam_filtering) { def seqtype = it[3] def organism = it[4] def strandedness = it[5] - def udg = it[6] + def udg = it[6] def stats = file(it[7]) def poststats = file("$projectDir/assets/nf-core_eager_dummy.txt") - [samplename, libraryid, lane, seqtype, organism, strandedness, udg, stats, poststats ] } + [samplename, libraryid, lane, seqtype, organism, strandedness, udg, stats, poststats ] + } .set{ ch_allflagstats_for_endorspy } } @@ -1955,7 +1954,6 @@ ch_input_for_librarymerging.merge_me [it[0], libraryid, it[2], seqtype, it[4], it[5], it[6], bam, bai ] } - .dump(tag: "input_for_lib_merging") .set { ch_fixedinput_for_librarymerging } process library_merge { @@ -1964,7 +1962,7 @@ process library_merge { publishDir "${params.outdir}/merged_bams/initial", mode: params.publish_dir_mode input: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_fixedinput_for_librarymerging.dump(tag: "library_merge_input") + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_fixedinput_for_librarymerging output: tuple samplename, val("${samplename}_libmerged"), lane, seqtype, organism, strandedness, udg, path("*_libmerged_rmdup.bam"), path("*_libmerged_rmdup.bam.{bai,csi}") into ch_output_from_librarymerging @@ -2232,7 +2230,7 @@ process bam_trim { tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_bamutils_decision.totrim output: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.trimmed.bam"), file("*.trimmed.bam.{bai,csi}") into ch_trimmed_from_bamutils + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.trimmed.bam"), path("*.trimmed.bam.{bai,csi}") into ch_trimmed_from_bamutils script: def softclip = params.bamutils_softclip ? 
@@ -2264,7 +2262,7 @@ ch_trimmed_formerge = ch_bamutils_decision.notrim
       def seqtype = it[3]
       def organism = it[4]
       def strandedness = it[5]
-      def udg = it[6]
+      def udg = it[6]
       def bam = it[7].flatten()
       def bai = it[8].flatten()

@@ -2490,10 +2488,36 @@ ch_damagemanipulation_for_genotyping_pileupcaller
 // Create pileupcaller input tuples
 ch_input_for_genotyping_pileupcaller.singleStranded
   .groupTuple(by:[5])
+  .map{
+    def samplename = it[0]
+    def libraryid = it[1]
+    def lane = it[2]
+    def seqtype = it[3]
+    def organism = it[4]
+    def strandedness = it[5]
+    def udg = it[6]
+    def bam = it[7].flatten()
+    def bai = it[8].flatten()
+
+    [samplename, libraryid, lane, seqtype, organism, strandedness, udg, bam, bai ]
+  }
   .set {ch_prepped_for_pileupcaller_single}

 ch_input_for_genotyping_pileupcaller.doubleStranded
   .groupTuple(by:[5])
+  .map{
+    def samplename = it[0]
+    def libraryid = it[1]
+    def lane = it[2]
+    def seqtype = it[3]
+    def organism = it[4]
+    def strandedness = it[5]
+    def udg = it[6]
+    def bam = it[7].flatten()
+    def bai = it[8].flatten()
+
+    [samplename, libraryid, lane, seqtype, organism, strandedness, udg, bam, bai ]
+  }
   .set {ch_prepped_for_pileupcaller_double}

 process genotyping_pileupcaller {
@@ -2505,12 +2529,12 @@ process genotyping_pileupcaller {
   params.run_genotyping && params.genotyping_tool == 'pileupcaller'

   input:
-  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, bam, bai from ch_prepped_for_pileupcaller_double.mix(ch_prepped_for_pileupcaller_single)
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_prepped_for_pileupcaller_double.mix(ch_prepped_for_pileupcaller_single)
   file fasta from ch_fasta_for_genotyping_pileupcaller.collect()
   file fai from ch_fai_for_pileupcaller.collect()
   file dict from ch_dict_for_pileupcaller.collect()
   path(bed) from ch_bed_for_pileupcaller.collect()
-  path(snp) from ch_snp_for_pileupcaller.collect().dump(tag: "pileupcaller_snp_file")
+  path(snp) from ch_snp_for_pileupcaller.collect()

   output:
   tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("pileupcaller.${strandedness}.*") into ch_for_eigenstrat_snp_coverage
@@ -2541,7 +2565,7 @@ process eigenstrat_snp_coverage {
   params.run_genotyping && params.genotyping_tool == 'pileupcaller'

   input:
-  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump(tag:'eigenstrat_input')
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage

   output:
   tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc
@@ -2672,7 +2696,7 @@ process vcf2genome {
 if (!params.additional_vcf_files) {
   ch_vcfs_for_multivcfanalyzer = ch_ug_for_multivcfanalyzer.map{ it[-1] }.collect()
 } else {
-  ch_vcfs_for_multivcfanalyzer = ch_ug_for_multivcfanalyzer.map{ it[-1] }.mix(ch_extravcfs_for_multivcfanalyzer).collect().dump(tag: "postmix")
+  ch_vcfs_for_multivcfanalyzer = ch_ug_for_multivcfanalyzer.map{ it[-1] }.mix(ch_extravcfs_for_multivcfanalyzer).collect()
 }

 process multivcfanalyzer {
@@ -3018,8 +3042,8 @@ process kraken {
   path(krakendb) from ch_krakendb

   output:
-  file "*.kraken.out" into ch_kraken_out
-  tuple prefix, path("*.kraken2_report") into ch_kraken_report, ch_kraken_for_multiqc
+  file "*.kraken.out" optional true into ch_kraken_out
+  tuple prefix, path("*.kraken2_report") optional true into ch_kraken_report, ch_kraken_for_multiqc

   script:
   prefix = fastq.baseName
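The two `.map{}` blocks added above reshape what `groupTuple(by: [5])` emits: every non-grouping field becomes a list, and the per-library BAM/BAI entries become nested lists, which the map flattens before pileupcaller stages them. A small standalone Groovy sketch with made-up sample and file names:

```groovy
// Two hypothetical single-stranded libraries grouped on strandedness (index 5)
Channel
    .of( [ 'sampleA', 'lib1', 1, 'PE', 'human', 'single', 'none', [ 'a.bam' ], [ 'a.bam.bai' ] ],
         [ 'sampleB', 'lib2', 1, 'SE', 'human', 'single', 'none', [ 'b.bam' ], [ 'b.bam.bai' ] ] )
    .groupTuple(by: [5])
    .map { it ->
        // after grouping, it[7]/it[8] hold lists of lists; flatten them so the
        // downstream process receives one simple list of BAMs and indices
        [ it[0], it[1], it[2], it[3], it[4], it[5], it[6], it[7].flatten(), it[8].flatten() ]
    }
    .view()
```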
@@ -3112,31 +3136,31 @@ process get_software_versions {
     echo $workflow.manifest.version &> v_pipeline.txt
     echo $workflow.nextflow.version &> v_nextflow.txt

-    fastqc --version &> v_fastqc.txt 2>&1 || true
+    fastqc -t ${task.cpus} --version &> v_fastqc.txt 2>&1 || true
     AdapterRemoval --version &> v_adapterremoval.txt 2>&1 || true
     fastp --version &> v_fastp.txt 2>&1 || true
     bwa &> v_bwa.txt 2>&1 || true
-    circulargenerator --help | head -n 1 &> v_circulargenerator.txt 2>&1 || true
+    circulargenerator -Xmx${task.memory.toGiga()}g --help | head -n 1 &> v_circulargenerator.txt 2>&1 || true
     samtools --version &> v_samtools.txt 2>&1 || true
-    dedup -v &> v_dedup.txt 2>&1 || true
+    dedup -Xmx${task.memory.toGiga()}g -v &> v_dedup.txt 2>&1 || true
     ## bioconda recipe of picard is incorrectly set up and extra warning made with stderr, this ugly command ensures only version exported
-    ( exec 7>&1; picard MarkDuplicates --version 2>&1 >&7 | grep -v '/' >&2 ) 2> v_markduplicates.txt || true
-    qualimap --version &> v_qualimap.txt 2>&1 || true
+    ( exec 7>&1; picard -Xmx${task.memory.toMega()}M MarkDuplicates --version 2>&1 >&7 | grep -v '/' >&2 ) 2> v_markduplicates.txt || true
+    qualimap --version --java-mem-size=${task.memory.toGiga()}G &> v_qualimap.txt 2>&1 || true
     preseq &> v_preseq.txt 2>&1 || true
-    gatk --version 2>&1 | grep '(GATK)' > v_gatk.txt 2>&1 || true
-    gatk3 --version 2>&1 | head -n 1 > v_gatk3.txt 2>&1 || true
+    gatk --java-options "-Xmx${task.memory.toGiga()}G" --version 2>&1 | grep '(GATK)' > v_gatk.txt 2>&1 || true
+    gatk3 -Xmx${task.memory.toGiga()}g --version 2>&1 | head -n 1 > v_gatk3.txt 2>&1 || true
     freebayes --version &> v_freebayes.txt 2>&1 || true
     bedtools --version &> v_bedtools.txt 2>&1 || true
-    damageprofiler --version &> v_damageprofiler.txt 2>&1 || true
+    damageprofiler -Xmx${task.memory.toGiga()}g --version &> v_damageprofiler.txt 2>&1 || true
     bam --version &> v_bamutil.txt 2>&1 || true
     pmdtools --version &> v_pmdtools.txt 2>&1 || true
     angsd -h |& head -n 1 | cut -d ' ' -f3-4 &> v_angsd.txt 2>&1 || true
-    multivcfanalyzer --help | head -n 1 &> v_multivcfanalyzer.txt 2>&1 || true
-    malt-run --help |& tail -n 3 | head -n 1 | cut -f 2 -d'(' | cut -f 1 -d ',' &> v_malt.txt 2>&1 || true
-    MaltExtract --help | head -n 2 | tail -n 1 &> v_maltextract.txt 2>&1 || true
+    multivcfanalyzer -Xmx${task.memory.toGiga()}g --help | head -n 1 &> v_multivcfanalyzer.txt 2>&1 || true
+    malt-run -J-Xmx${task.memory.toGiga()}g --help |& tail -n 3 | head -n 1 | cut -f 2 -d'(' | cut -f 1 -d ',' &> v_malt.txt 2>&1 || true
+    MaltExtract -Xmx${task.memory.toGiga()}g --help | head -n 2 | tail -n 1 &> v_maltextract.txt 2>&1 || true
     multiqc --version &> v_multiqc.txt 2>&1 || true
-    vcf2genome -h |& head -n 1 &> v_vcf2genome.txt || true
-    mtnucratio --help &> v_mtnucratiocalculator.txt || true
+    vcf2genome -Xmx${task.memory.toGiga()}g -h |& head -n 1 &> v_vcf2genome.txt || true
+    mtnucratio -Xmx${task.memory.toGiga()}g --help &> v_mtnucratiocalculator.txt || true
     sexdeterrmine --version &> v_sexdeterrmine.txt || true
     kraken2 --version | head -n 1 &> v_kraken.txt || true
     endorS.py --version &> v_endorSpy.txt || true
@@ -3144,7 +3168,7 @@ process get_software_versions {
     bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true
     eigenstrat_snp_coverage --version | cut -d ' ' -f2 >v_eigenstrat_snp_coverage.txt || true
     mapDamage --version > v_mapdamage.txt || true
-    bbduk.sh | grep 'Last modified' | cut -d ' ' -f 3-99 > v_bbduk.txt || true
+    bbversion.sh > v_bbduk.txt || true
     bcftools --version | grep 'bcftools' | cut -d ' ' -f 2 > v_bcftools.txt || true
     scrape_software_versions.py &> software_versions_mqc.yaml
     """
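Each Java-based version call above now receives an explicit heap size derived from the task's memory reservation, so the JVM never requests more heap than the scheduler granted (the behaviour that reportedly left the process hanging on some SGE clusters). A minimal, hypothetical single-tool illustration of the same pattern (the process name and memory value are invented):

```groovy
process example_version_call {
    memory '4 GB'

    output:
    path 'v_dedup.txt' into ch_example_version   // DSL1-style output, as in the pipeline

    script:
    // task.memory.toGiga() converts the reservation into a number usable as -Xmx,
    // capping the JVM heap at what was actually allocated to the job
    """
    dedup -Xmx${task.memory.toGiga()}g -v &> v_dedup.txt 2>&1 || true
    """
}
```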
@@ -3319,6 +3343,7 @@ workflow.onComplete {
     if (workflow.success) {
         log.info "-${c_purple}[nf-core/eager]${c_green} Pipeline completed successfully${c_reset}-"
         log.info "-${c_purple}[nf-core/eager]${c_green} MultiQC run report can be found in ${params.outdir}/multiqc ${c_reset}-"
+        log.info "-${c_purple}[nf-core/eager]${c_green} Further output documentation can be seen at https://nf-co.re/eager/output ${c_reset}-"
     } else {
         checkHostname()
         log.info "-${c_purple}[nf-core/eager]${c_red} Pipeline completed with errors${c_reset}-"
@@ -3340,7 +3365,6 @@ workflow.onError {
 def extract_data(tsvFile) {
     Channel.fromPath(tsvFile)
         .splitCsv(header: true, sep: '\t')
-        .dump(tag:'tsv_extract')
         .map { row ->

             def expected_keys = ['Sample_Name', 'Library_ID', 'Lane', 'Colour_Chemistry', 'SeqType', 'Organism', 'Strandedness', 'UDG_Treatment', 'R1', 'R2', 'BAM']
diff --git a/nextflow.config b/nextflow.config
index 86747172b..28ab8c132 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -284,7 +284,7 @@ params {

 // Container slug. Stable releases should specify release tag!
 // Developmental code should specify :dev
-process.container = 'nfcore/eager:2.4.4'
+process.container = 'nfcore/eager:2.4.5'

 // Load base.config by default for all pipelines
 includeConfig 'conf/base.config'
@@ -414,7 +414,7 @@ manifest {
   description = 'A fully reproducible and state-of-the-art ancient DNA analysis pipeline'
   mainScript = 'main.nf'
   nextflowVersion = '>=20.07.1'
-  version = '2.4.4'
+  version = '2.4.5'
 }

 // Function to ensure that resource requirements don't go beyond
diff --git a/nextflow_schema.json b/nextflow_schema.json
index ff998a4b7..d93b38233 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -475,7 +475,7 @@
             "default": 30,
             "description": "Specify read minimum length to be kept for downstream analysis.",
             "fa_icon": "fas fa-ruler",
-            "help_text": "Defines the minimum read length that is required for reads after merging to be considered for downstream analysis after read merging. Default is `30`.\n\nNote that performing read length filtering at this step is not reliable for correct endogenous DNA calculation, when you have a large percentage of very short reads in your library - such as retrieved in single-stranded library protocols. When you have very few reads passing this length filter, it will artificially inflate your endogenous DNA by creating a very small denominator. In these cases it is recommended to set this to 0, and use `--bam_filter_minreadlength` instead, to filter out 'un-usable' short reads after mapping.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n"
+            "help_text": "Defines the minimum read length that is required for reads after merging to be considered for downstream analysis after read merging. Default is `30`.\n\nNote that when you have a large percentage of very short reads in your library (< 20 bp) - such as retrieved in single-stranded library protocols - performing read length filtering at this step is not _always_ reliable for correct endogenous DNA calculation. When you have very few reads passing this length filter, it will artificially inflate your 'endogenous DNA' value by creating a very small denominator.\n\nIf you notice you have ultra short reads (< 20 bp), it is recommended to set this parameter to 0, and use `--bam_filter_minreadlength` instead, to filter out 'un-usable' short reads after mapping. A caveat, however, is that this will cause a very large increase in computational run time, because all reads in the library will then be mapped.\n\n> Modifies AdapterRemoval parameter: `--minlength`\n"
         },
         "clip_min_read_quality": {
             "type": "integer",
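The reworked `clip_readlength` help text above turns on a simple ratio: endogenous DNA is mapped reads divided by total reads, so discarding short reads before mapping shrinks the denominator and inflates the percentage. A toy Groovy calculation with entirely made-up counts:

```groovy
// Made-up library: lots of ultra-short reads, few of which map
def totalReads    = 1000000   // every read in the library
def mappedReads   = 20000     // reads that align to the reference
def readsOver30bp = 100000    // reads surviving a 30 bp pre-mapping length filter

def endogenousAll      = 100 * mappedReads / totalReads      // 2%  : denominator is the whole library
def endogenousFiltered = 100 * mappedReads / readsOver30bp   // 20% : denominator shrunk by the filter

println "endogenous (all reads as denominator): ${endogenousAll}%"
println "endogenous (length-filtered denominator): ${endogenousFiltered}%"
```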
@@ -653,28 +653,28 @@
             "description": "Specify the -N parameter for bowtie2 (mismatches in seed). This will override defaults from alignmode/sensitivity.",
             "fa_icon": "fas fa-sort-numeric-down",
             "help_text": "The number of mismatches allowed in the seed during seed-and-extend procedure of Bowtie2. This will override any values set with `--bt2_sensitivity`. Can either be 0 or 1. Default: 0 (i.e. use`--bt2_sensitivity` defaults).\n\n> Modifies Bowtie2 parameters: `-N`",
-            "enum": [
-                0,
-                1
-            ]
+            "default": 0
         },
         "bt2l": {
             "type": "integer",
             "description": "Specify the -L parameter for bowtie2 (length of seed substrings). This will override defaults from alignmode/sensitivity.",
             "fa_icon": "fas fa-ruler-horizontal",
-            "help_text": "The length of the seed sub-string to use during seeding. This will override any values set with `--bt2_sensitivity`. Default: 0 (i.e. use`--bt2_sensitivity` defaults: [20 for local and 22 for end-to-end](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line).\n\n> Modifies Bowtie2 parameters: `-L`"
+            "help_text": "The length of the seed sub-string to use during seeding. This will override any values set with `--bt2_sensitivity`. Default: 0 (i.e. use`--bt2_sensitivity` defaults: [20 for local and 22 for end-to-end](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#command-line).\n\n> Modifies Bowtie2 parameters: `-L`",
+            "default": 0
         },
         "bt2_trim5": {
             "type": "integer",
             "description": "Specify number of bases to trim off from 5' (left) end of read before alignment.",
             "fa_icon": "fas fa-cut",
-            "help_text": "Number of bases to trim at the 5' (left) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0\n\n> Modifies Bowtie2 parameters: `-bt2_trim5`"
+            "help_text": "Number of bases to trim at the 5' (left) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0\n\n> Modifies Bowtie2 parameters: `-bt2_trim5`",
+            "default": 0
         },
         "bt2_trim3": {
             "type": "integer",
             "description": "Specify number of bases to trim off from 3' (right) end of read before alignment.",
             "fa_icon": "fas fa-cut",
-            "help_text": "Number of bases to trim at the 3' (right) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0.\n\n> Modifies Bowtie2 parameters: `-bt2_trim3`"
+            "help_text": "Number of bases to trim at the 3' (right) end of read prior alignment. Maybe useful when left-over sequencing artefacts of in-line barcodes present Default: 0.\n\n> Modifies Bowtie2 parameters: `-bt2_trim3`",
+            "default": 0
         },
         "bt2_maxins": {
             "type": "integer",
@@ -731,13 +731,15 @@
             "type": "integer",
             "description": "Minimum mapping quality for reads filter.",
             "fa_icon": "fas fa-greater-than-equal",
-            "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis. By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).\n\n> Modifies samtools view parameter: `-q`"
+            "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis. By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).\n\n> Modifies samtools view parameter: `-q`",
+            "default": 0
         },
         "bam_filter_minreadlength": {
             "type": "integer",
             "fa_icon": "fas fa-ruler-horizontal",
             "description": "Specify minimum read length to be kept after mapping.",
-            "help_text": "Specify minimum length of mapped reads. This filtering will apply at the same time as mapping quality filtering.\n\nIf used _instead_ of minimum length read filtering at AdapterRemoval, this can be useful to get more realistic endogenous DNA percentages, when most of your reads are very short (e.g. in single-stranded libraries) and would otherwise be discarded by AdapterRemoval (thus making an artificially small denominator for a typical endogenous DNA calculation). Note in this context you should not perform mapping quality filtering nor discarding of unmapped reads to ensure a correct denominator of all reads, for the endogenous DNA calculation.\n\n> Modifies filter_bam_fragment_length.py parameter: `-l`"
+            "help_text": "Specify minimum length of mapped reads. This filtering will apply at the same time as mapping quality filtering.\n\nIf used _instead_ of minimum length read filtering at AdapterRemoval, this can be useful to get more realistic endogenous DNA percentages, when most of your reads are very short (e.g. in single-stranded libraries) and would otherwise be discarded by AdapterRemoval (thus making an artificially small denominator for a typical endogenous DNA calculation). Note in this context you should not perform mapping quality filtering nor discarding of unmapped reads to ensure a correct denominator of all reads, for the endogenous DNA calculation.\n\n> Modifies filter_bam_fragment_length.py parameter: `-l`",
+            "default": 0
         },
         "bam_unmapped_type": {
             "type": "string",
@@ -1168,7 +1170,8 @@
             "type": "integer",
             "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in --freebayes_C.",
             "fa_icon": "fab fa-think-peaks",
-            "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified C. Not set by default.\n\n> Modifies freebayes parameter: `-g`"
+            "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified C. Not set by default.\n\n> Modifies freebayes parameter: `-g`",
+            "default": 0
         },
         "freebayes_p": {
             "type": "integer",
@@ -1683,9 +1686,9 @@
         "maltextract_percentidentity": {
             "type": "number",
             "description": "Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter.",
-            "default": 85.0,
+            "default": 85,
             "fa_icon": "fas fa-id-card",
-            "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85.0`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--minPI`"
+            "help_text": "Minimum percent identity alignments are required to have to be reported. Higher values allows fewer mismatches between read and reference sequence, but therefore will provide greater confidence in the hit. Lower values allow more mismatches, which can account for damage and divergence of a related strain/species to the reference. Recommended to set same as MALT parameter or higher. Default: `85`.\n\nOnly when `--metagenomic_tool malt` is also supplied.\n\n> Modifies MaltExtract parameter: `--minPI`"
         },
         "maltextract_topalignment": {
             "type": "boolean",