diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 21eddc702..946c9caa1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,7 +20,7 @@ jobs:
     strategy:
       matrix:
         # Nextflow versions: check pipeline minimum and current latest
-        nxf_ver: ['20.04.0', '']
+        nxf_ver: ['20.07.1', '']
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v2
@@ -34,13 +34,13 @@ jobs:
       - name: Build new docker image
         if: env.MATCHED_FILES
-        run: docker build --no-cache . -t nfcore/eager:2.3.1
+        run: docker build --no-cache . -t nfcore/eager:2.3.2

       - name: Pull docker image
         if: ${{ !env.MATCHED_FILES }}
         run: |
           docker pull nfcore/eager:dev
-          docker tag nfcore/eager:dev nfcore/eager:2.3.1
+          docker tag nfcore/eager:dev nfcore/eager:2.3.2

       - name: Install Nextflow
         env:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2e4985f0d..013927858 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,40 @@
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

+## [2.3.2] - 2021-03-16
+
+### `Added`
+
+- [#687](https://github.com/nf-core/eager/pull/687) - Adds Kraken2 unique kmer counting report
+- [#676](https://github.com/nf-core/eager/issues/676) - Refactor help message / summary message formatting to automatic versions using the nf-core library
+- [#682](https://github.com/nf-core/eager/issues/682) - Add AdapterRemoval `--qualitymax` flag to allow a FASTQ Phred score range maximum of more than 41
+
+### `Fixed`
+
+- [#666](https://github.com/nf-core/eager/issues/666) - Fixed input file staging for `print_nuclear_contamination`
+- [#631](https://github.com/nf-core/eager/issues/631) - Update minimum Nextflow version to 20.07.1, due to an unfortunate bug in Nextflow 20.04.1 causing eager to crash if a patch was pulled
+- Made MultiQC crash behaviour stricter when dealing with large datasets, as reported by @ashildv
+- [#652](https://github.com/nf-core/eager/issues/652) - Added note to documentation that using `--skip_collapse` will use _paired-end_ alignment mode with mappers when using PE data
+- [#626](https://github.com/nf-core/eager/issues/626) - Add additional checks to ensure the pipeline will give a useful error if cells of a TSV column are empty
+- [#673](https://github.com/nf-core/eager/pull/673) - Fix Kraken database loading when loading from a directory instead of a compressed file
+- [#688](https://github.com/nf-core/eager/issues/668) - Allow pipeline to complete, even if Qualimap crashes due to an empty or corrupt BAM file for one sample/library
+- [#683](https://github.com/nf-core/eager/pull/683) - Sets `--igenomes_ignore` to true by default, as it is rarely used by users currently, and this makes resolving configs less complex
+- Added exit code `140` to the retryable exit code list to account for certain scheduler wall-time limit fails
+- [#672](https://github.com/nf-core/eager/issues/672) - Removed java parameter from Picard tools which could cause memory issues
+- [#679](https://github.com/nf-core/eager/issues/679) - Refactor within-process bash conditions to groovy/nextflow, due to incompatibility with some server environments
+- [#690](https://github.com/nf-core/eager/pull/690) - Fixed ANGSD output mode for beagle by setting `-doMajorMinor 1` as default in that case
+- [#693](https://github.com/nf-core/eager/issues/693) - Fixed broken TSV input validation
for the Colour Chemistry column +- [#695](https://github.com/nf-core/eager/issues/695) - Fixed incorrect `-profile` order in tutorials (originally written reversed due to [nextflow bug](https://github.com/nextflow-io/nextflow/issues/1792)) +- [#653](https://github.com/nf-core/eager/issues/653) - Fixed file collision errors with sexdeterrmine for two same-named libraries with different strandedness + +### `Dependencies` + +- Bumped MultiQC to 1.10 for improved functionality +- Bumped HOPS to 0.35 for MultiQC 1.10 compatibility + +### `Deprecated` + ## [2.3.1] - 2021-01-14 ### `Added` diff --git a/Dockerfile b/Dockerfile index 653c46c19..773a11a32 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,10 +7,10 @@ COPY environment.yml / RUN conda env create --quiet -f /environment.yml && conda clean -a # Add conda installation dir to PATH (instead of doing 'conda activate') -ENV PATH /opt/conda/envs/nf-core-eager-2.3.1/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-eager-2.3.2/bin:$PATH # Dump the details of the installed packages to a file for posterity -RUN conda env export --name nf-core-eager-2.3.1 > nf-core-eager-2.3.1.yml +RUN conda env export --name nf-core-eager-2.3.2 > nf-core-eager-2.3.2.yml # Instruct R processes to use these empty files instead of clashing with a local version RUN touch .Rprofile diff --git a/README.md b/README.md index cd4cde5ee..43eec0138 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ -# ![nf-core/eager](docs/images/nf-core-eager_logo.png) +# ![nf-core/eager](docs/images/nf-core_eager_logo.png) **A fully reproducible and state-of-the-art ancient DNA analysis pipeline**. [![GitHub Actions CI Status](https://github.com/nf-core/eager/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/eager/actions) [![GitHub Actions Linting Status](https://github.com/nf-core/eager/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/eager/actions) -[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.04.0-brightgreen.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.07.1-brightgreen.svg)](https://www.nextflow.io/) [![nf-core](https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg)](https://nf-co.re/) [![DOI](https://zenodo.org/badge/135918251.svg)](https://zenodo.org/badge/latestdoi/135918251) @@ -158,7 +158,10 @@ of this pipeline: Those who have provided conceptual guidance, suggestions, bug reports etc. +* [Alexandre Gilardet](https://github.com/alexandregilardet) * Arielle Munters +* [Charles Plessy](https://github.com/charles-plessy) +* [Åshild Vågene](https://github.com/ashildv) * [Hester van Schalkwyk](https://github.com/hesterjvs) * [Ido Bar](https://github.com/IdoBar) * [Irina Velsko](https://github.com/ivelsko) @@ -184,7 +187,8 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations If you use `nf-core/eager` for your analysis, please cite the `eager` preprint as follows: -> James A. Fellows Yates, Thiseas Christos Lamnidis, Maxime Borry, Aida Andrades Valtueña, Zandra Fagneräs, Stephen Clayton, Maxime U. Garcia, Judith Neukamm, Alexander Peltzer **Reproducible, portable, and efficient ancient genome reconstruction with nf-core/eager** bioRxiv 2020.06.11.145615; [doi: https://doi.org/10.1101/2020.06.11.145615](https://doi.org/10.1101/2020.06.11.145615) + +> Fellows Yates JA, Lamnidis TC, Borry M, Valtueña Andrades A, Fagernäs Z, Clayton S, Garcia MU, Neukamm J, Peltzer A. 2021. 
Reproducible, portable, and efficient ancient genome reconstruction with nf-core/eager. PeerJ 9:e10947. DOI: [10.7717/peerj.10947](https://doi.org/10.7717/peerj.10947). You can cite the eager zenodo record for a specific version using the following [doi: 10.5281/zenodo.3698082](https://zenodo.org/badge/latestdoi/135918251) diff --git a/bin/kraken_parse.py b/bin/kraken_parse.py index 7fb348c03..3cdd4e213 100755 --- a/bin/kraken_parse.py +++ b/bin/kraken_parse.py @@ -19,18 +19,24 @@ def _get_args(): default=50, help="Minimum number of hits on clade to report it. Default = 50") parser.add_argument( - '-o', - dest="output", + '-or', + dest="readout", default=None, - help="Output file. Default = .kraken_parsed.csv") + help="Read count output file. Default = .read_kraken_parsed.csv") + parser.add_argument( + '-ok', + dest="kmerout", + default=None, + help="Kmer Output file. Default = .kmer_kraken_parsed.csv") args = parser.parse_args() infile = args.krakenReport countlim = int(args.count) - outfile = args.output + readout = args.readout + kmerout = args.kmerout - return(infile, countlim, outfile) + return(infile, countlim, readout, kmerout) def _get_basename(file_name): @@ -51,14 +57,23 @@ def parse_kraken(infile, countlim): ''' with open(infile, 'r') as f: - resdict = {} + read_dict = {} + kmer_dict = {} csvreader = csv.reader(f, delimiter='\t') for line in csvreader: reads = int(line[1]) if reads >= countlim: - taxid = line[4] - resdict[taxid] = reads - return(resdict) + taxid = line[6] + kmer = line[3] + unique_kmer = line[4] + try: + kmer_duplicity = float(kmer)/float(unique_kmer) + except ZeroDivisionError: + kmer_duplicity = 0 + read_dict[taxid] = reads + kmer_dict[taxid] = kmer_duplicity + + return(read_dict, kmer_dict) def write_output(resdict, infile, outfile): @@ -70,10 +85,17 @@ def write_output(resdict, infile, outfile): if __name__ == '__main__': - INFILE, COUNTLIM, outfile = _get_args() + INFILE, COUNTLIM, readout, kmerout = _get_args() - if not outfile: - outfile = _get_basename(INFILE)+".kraken_parsed.csv" + if not readout: + read_outfile = _get_basename(INFILE)+".read_kraken_parsed.csv" + else: + read_outfile = readout + if not kmerout: + kmer_outfile = _get_basename(INFILE)+".kmer_kraken_parsed.csv" + else: + kmer_outfile = kmerout - tmp_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM) - write_output(resdict=tmp_dict, infile=INFILE, outfile=outfile) + read_dict, kmer_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM) + write_output(resdict=read_dict, infile=INFILE, outfile=read_outfile) + write_output(resdict=kmer_dict, infile=INFILE, outfile=kmer_outfile) diff --git a/bin/merge_kraken_res.py b/bin/merge_kraken_res.py index c6c16e4c1..53e45a63d 100755 --- a/bin/merge_kraken_res.py +++ b/bin/merge_kraken_res.py @@ -15,21 +15,29 @@ def _get_args(): formatter_class=argparse.RawDescriptionHelpFormatter, description='Merging csv count files in one table') parser.add_argument( - '-o', - dest="output", - default="kraken_count_table.csv", - help="Output file. Default = kraken_count_table.csv") + '-or', + dest="readout", + default="kraken_read_count_table.csv", + help="Read count output file. Default = kraken_read_count_table.csv") + parser.add_argument( + '-ok', + dest="kmerout", + default="kraken_kmer_unicity_table.csv", + help="Kmer unicity output file. 
Default = kraken_kmer_unicity_table.csv") args = parser.parse_args() - outfile = args.output + readout = args.readout + kmerout = args.kmerout - return(outfile) + return(readout, kmerout) def get_csv(): tmp = [i for i in os.listdir() if ".csv" in i] - return(tmp) + kmer = [i for i in tmp if '.kmer_' in i] + read = [i for i in tmp if '.read_' in i] + return(read, kmer) def _get_basename(file_name): @@ -54,8 +62,9 @@ def write_csv(pd_dataframe, outfile): if __name__ == "__main__": - OUTFILE = _get_args() - all_csv = get_csv() - resdf = merge_csv(all_csv) - write_csv(resdf, OUTFILE) - print(resdf) + READOUT, KMEROUT = _get_args() + reads, kmers = get_csv() + read_df = merge_csv(reads) + kmer_df = merge_csv(kmers) + write_csv(read_df, READOUT) + write_csv(kmer_df, KMEROUT) \ No newline at end of file diff --git a/conf/base.config b/conf/base.config index 8266b9c72..5cfa9ba3e 100644 --- a/conf/base.config +++ b/conf/base.config @@ -14,7 +14,7 @@ process { memory = { check_max( 7.GB * task.attempt, 'memory' ) } time = { check_max( 24.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in [143,137,104,134,139, 140] ? 'retry' : 'finish' } maxRetries = 3 maxErrors = '-1' @@ -74,7 +74,7 @@ process { } withName:qualimap{ - errorStrategy = { task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in [1,143,137,104,134,139, 140] ? 'retry' : task.exitStatus in [255] ? 'ignore' : 'finish' } } withName:preseq { @@ -82,30 +82,26 @@ process { } withName:damageprofiler { - errorStrategy = { task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in [1,143,137,104,134,139, 140] ? 'retry' : 'finish' } } // Add 1 retry for certain java tools as not enough heap space java errors gives exit code 1 withName: dedup { - errorStrategy = { task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in [1,143,137,104,134,139, 140] ? 'retry' : 'finish' } } withName: markduplicates { - errorStrategy = { task.exitStatus in [143,137] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in [143,137, 140] ? 'retry' : 'finish' } } // Add 1 retry as not enough heapspace java error gives exit code 1 withName: malt { - errorStrategy = { task.exitStatus in [1,143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in [1,143,137,104,134,139, 140] ? 'retry' : 'finish' } } // other process specific exit statuses withName: nuclear_contamination { - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'ignore' : 'retry' } - } - - withName: multiqc { - errorStrategy = { task.exitStatus in [143,137] ? 'retry' : 'ignore' } + errorStrategy = { task.exitStatus in [143,137,104,134,139, 140] ? 
'ignore' : 'retry' } } } diff --git a/docs/images/tutorials/profiles/config_profile_inheritence.png b/docs/images/tutorials/profiles/config_profile_inheritence.png index 756ac6df8..f76f9dbab 100644 Binary files a/docs/images/tutorials/profiles/config_profile_inheritence.png and b/docs/images/tutorials/profiles/config_profile_inheritence.png differ diff --git a/docs/images/tutorials/profiles/config_profile_inheritence.svg b/docs/images/tutorials/profiles/config_profile_inheritence.svg index 4759eac8b..9b5a7b224 100644 --- a/docs/images/tutorials/profiles/config_profile_inheritence.svg +++ b/docs/images/tutorials/profiles/config_profile_inheritence.svg @@ -8,7 +8,7 @@ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" sodipodi:docname="config_profile_inheritence.svg" - inkscape:version="1.0 (1.0+r73+1)" + inkscape:version="1.0.2 (1.0.2+r75+1)" id="svg8" version="1.1" viewBox="0 0 203.72916 254" @@ -864,9 +864,9 @@ fit-margin-left="0" fit-margin-top="0" inkscape:window-maximized="1" - inkscape:window-y="0" - inkscape:window-x="0" - inkscape:window-height="1043" + inkscape:window-y="1107" + inkscape:window-x="1920" + inkscape:window-height="1016" inkscape:window-width="1920" inkscape:bbox-nodes="true" inkscape:snap-bbox="true" @@ -874,9 +874,9 @@ inkscape:document-rotation="0" inkscape:current-layer="g1826" inkscape:document-units="mm" - inkscape:cy="428.58865" - inkscape:cx="370.88851" - inkscape:zoom="0.7" + inkscape:cy="925.64833" + inkscape:cx="430.04375" + inkscape:zoom="0.98994949" inkscape:pageshadow="2" inkscape:pageopacity="0.0" borderopacity="1.0" @@ -1374,7 +1374,7 @@ -profile b,a + style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:'Ubuntu Mono';-inkscape-font-specification:'Ubuntu Mono'">-profile a,b -c new.conf \ -profile c,b,a + y="32.525859">-profile a,b,c Note that in nf-core/eager this will be run on single- and double-stranded variants of the same library _separately_. This can also help assess for differential contamination between libraries. + #### Relative Coverage Theoretically, males are expected to cluster around (0.5, 0.5) in the produced scatter plot, while females are expected to cluster around (1.0, 0.0). In practice, when analysing ancient DNA, these relative coverage on both axes is slightly lower than expected, and individuals can cluster around (0.45, 0.45) and (0.85, 0.05). As the number of covered sites for an individual gets smaller, the confidence on the estimate becomes lower, because it is increasingly more likely to be affected by randomness in the preservation and sequencing of ancient DNA. @@ -667,7 +671,11 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir - `metagenomic_complexity_filter` - this contains the output from filtering of input reads to metagenomic classification of low-sequence complexity reads as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `._bbduk.stats` files to get summary statistics of the filtering. - `metagenomic_classification/` - this contains the output for a given metagenomic classifier. 
   - Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested.
-  - Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table.
+  - Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0)-like fashion. This is very useful for checking breadth of coverage and detecting read stacking. A small number of aligned reads (low coverage) combined with a kmer duplication >1 is usually a sign of read stacking, which is typically indicative of a false-positive hit (e.g. from over-amplified libraries). *Kmer duplication is defined as: number of kmers / number of unique kmers*. You will find two Kraken report formats available:
+    - the `*.kreport`, which is the old report format, without distinct minimizer count information, used by some tools such as [Pavian](https://github.com/fbreitwieser/pavian)
+    - the `*.kraken2_report`, which is the new Kraken report format, with the distinct minimizer count information.
+
+    Finally, the `*.kraken.out` files are the direct output of Kraken2
 - `maltextract/` - this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA)
 - `consensus_sequence/` - this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively.
 - `librarymerged_bams/` - these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on)
diff --git a/docs/usage.md b/docs/usage.md
index 7f0f4a6e3..7d13ff440 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -413,6 +413,14 @@ they are unique (e.g. if one library was sequenced on Lane 8 of two HiSeq runs,
 specify lanes as 8 and 16 for each FASTQ file respectively). For library
 merging errors, you must modify your `Library_ID`s accordingly, to make them
 unique.

+### A library or sample is missing in my MultiQC report
+
+In some cases it may be that no output log is produced by a particular tool for MultiQC. This sample will therefore not be displayed.
+
+Known cases include:
+
+- Qualimap: there will be no MultiQC output if the BAM file is empty. An empty BAM file is produced when no reads map to the reference and causes Qualimap to crash - this crash is ignored by nf-core/eager (to allow the rest of the pipeline to continue), so there will be no log file for that particular sample/library
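+
+  As a quick sanity check you can count the mapped reads in the suspect BAM yourself (a minimal sketch - the path below is illustrative, and `samtools` is assumed to be available in your environment):
+
+  ```bash
+  # Print the number of mapped reads in the BAM; 0 means Qualimap had nothing to analyse
+  samtools view -c -F 4 results/deduplication/SAMPLE1/SAMPLE1_rmdup.bam
+  ```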
+
 ## Tutorials

 ### Tutorial - How to investigate a failed run
@@ -563,7 +571,7 @@
 If it does, please ask the developer of the tool (although we will endeavour to
 help as much as we can via the [nf-core slack](https://nf-co.re/join/slack) in
 the #eager channel).

-### Tutorial - What are Profiles and How To Use Them
+### Tutorial - What are profiles and how to use them

 #### Tutorial Profiles - Background
@@ -606,7 +614,7 @@ profile called 'old_dna'.

 We will have run our pipeline with the following command

 ```bash
-nextflow run nf-core/eager -c old_dna_profile.config -profile old_dna,hpc_blue <...>
+nextflow run nf-core/eager -c old_dna_profile.config -profile hpc_blue,old_dna <...>
 ```

 Then our colleague wished to recreate your results. As long as the
@@ -614,7 +622,7 @@ Then our colleague wished to recreate your results. As long as the
 same pipeline settings but on their own cluster HPC 'purple'.

 ```bash
-nextflow run nf-core/eager -c old_dna_profile.config -profile old_dna,hpc_purple <...>
+nextflow run nf-core/eager -c old_dna_profile.config -profile hpc_purple,old_dna <...>
 ```

 (where the `old_dna` profile is defined in `old_dna_profile.config`, and
@@ -632,27 +640,27 @@ understanding 'inheritance' of profiles when specifying multiple
 profiles, when using `nextflow run`.

 When specifying multiple profiles, parameters defined in the profile in the
-first position will overwrite those in the second, and everything defined in the
-first and second will overwrite everything in a third.
+first position will be overwritten by those in the second, and everything defined in the
+first and second will be overwritten by everything in a third.

 This can be illustrated as follows.

 ```bash
-              overwrites  overwrites
-            ┌──────┐    ┌──────┐
-            │      ▼    │      ▼
--profile my_paper,cluster,institution
+              overwrites  overwrites
+            ┌──────┐    ┌──────┐
+            ▼      │    ▼      │
+-profile institution,cluster,my_paper
 ```

 This would be translated as follows.

 If your parameters looked like the following

-Parameter | Resolved Parameters | my_paper | cluster | institution
-----------------|------------------------|------------|----------|------------
---executor | singularity | \ | \ | singularity
---max_memory | 256GB | \ | 256GB | 756GB
---bwa_aln | 0.1 | 0.1 | 0.01 | \
+Parameter | Resolved Parameters | institution | cluster | my_paper
+----------------|------------------------|-------------|----------|----------
+--executor | singularity | singularity | \ | \
+--max_memory | 256GB | 756GB | 256GB | \
+--bwa_aln | 0.1 | \ | 0.01 | 0.1

 (where '\' is a parameter not defined in a given profile.)
@@ -667,10 +675,7 @@ defined in the `cluster` profile.

 ##### Tutorial Profiles - Configuration Files

 > :warning: This section is only needed for users that want to set up
-> institutional-level profiles.
-
-<details>
-<summary>Expand to view</summary>
-
+> institutional-level profiles. Otherwise please skip to [Writing your own profile](#tutorial-profiles---writing-your-own-profile) In actuality, a nf-core/eager run already contains many configs and profiles, and will normally use *multiple* configs profiles in a single run. Multiple @@ -706,7 +711,7 @@ Then running the pipeline with the profiles in the order of the following run command: ```bash -nextflow run nf-core/eager -c old_dna_profile.config -profile old_dna,hpc_blue <...> +nextflow run nf-core/eager -c old_dna_profile.config -profile hpc_blue,old_dna <...> ``` In the background, any parameters in the pipeline's `nextflow.config` @@ -789,13 +794,13 @@ profiles { If you run with `nextflow run -profile shh` to specify to use an institutional-level nf-core config, the parameters will be read as `--bwaalnn -0.04` and `--bwaalnl 32` as these are the defaults 'fall back' params as +0.04` and `--bwaalnl 32` as these are the default 'fall back' params as indicated in the example above. -If you specify as `nextflow run -profile pathogen_loose,shh`, as expected +If you specify as `nextflow run -profile shh,pathogen_loose`, as expected Nextflow will resolve the two parameters as `0.01` and `16`. -Importantly however, if you specify `-profile shh,pathogen_loose` the +Importantly however, if you specify `-profile pathogen_loose,shh` the `pathogen_loose` **profile** will **still** take precedence over just the 'global' params. @@ -807,9 +812,6 @@ This is also described in the Nextflow documentation This is because selecting a `profile` will always take precedence over the values specified in a config file, but outside of a profile. -
-</details>
-
- #### Tutorial Profiles - Writing your own profile We will now provide an example of how to write, use and share a project specific @@ -924,14 +926,14 @@ For example, Aida (Andrades Valtueña) on her cluster `sdag` at the MPI-SHH (`shh`) in Jena could run the following: ```bash -nextflow run nf-core/eager -c ///AndradesValtuena2018.config -profile AndradesValtuena2018,sdag,shh --input '////' <...> +nextflow run nf-core/eager -c ///AndradesValtuena2018.config -profile shh,sdag,AndradesValtuena2018 --input '////' <...> ``` Then a colleague at a different institution, such as the SciLifeLab, could run the same profile on the UPPMAX cluster in Uppsala with: ```bash -nextflow run nf-core/eager -c ///AndradesValtuena2018.config -profile AndradesValtuena2018,uppmax --input '////' <...> +nextflow run nf-core/eager -c ///AndradesValtuena2018.config -profile uppmax,AndradesValtuena2018 --input '////' <...> ``` And that's all there is to it. Of course you should always check that there are @@ -1016,7 +1018,7 @@ running. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ <...> ``` @@ -1080,7 +1082,7 @@ FASTA file and the corresponding indices. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/hs37d5.fa' \ @@ -1105,7 +1107,7 @@ directory (which contains 'intermediate' working files and directories). ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \` --fasta '../Reference/genome/hs37d5.fa' \ @@ -1134,7 +1136,7 @@ string to be clipped. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/hs37d5.fa' \ @@ -1159,7 +1161,7 @@ with `--dedupper`. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/hs37d5.fa' \ @@ -1184,7 +1186,7 @@ and the reference. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/hs37d5.fa' \ @@ -1211,7 +1213,7 @@ unmapped reads. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/hs37d5.fa' \ @@ -1241,7 +1243,7 @@ fragment. 
We will therefore use `--bamutils_clip_half_udg_left` and ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/hs37d5.fa' \ @@ -1277,7 +1279,7 @@ you can download the file from [here](https://github.com/nf-core/test-datasets/b ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/hs37d5.fa' \ @@ -1311,7 +1313,7 @@ is simply named 'X'. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/hs37d5.fa' \ @@ -1352,7 +1354,7 @@ providing the name of the mitochondrial DNA contig in our reference genome with ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/hs37d5.fa' \ @@ -1394,7 +1396,7 @@ file of these sites that is specified with `--pileupcaller_snpfile`. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/hs37d5.fa' \ @@ -1636,7 +1638,7 @@ running. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_screening20200720' \ <...> ``` @@ -1700,7 +1702,7 @@ FASTA file and the corresponding indices. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_screening20200720' \ --input 'screening20200720.tsv' \ --fasta '../Reference/genome/GRCh38.fa' \ @@ -1725,7 +1727,7 @@ directory (which contains 'intermediate' working files and directories). ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_screening20200720' \ --input 'screening20200720.tsv' \ --fasta '../Reference/genome/GRCh38.fa' \ @@ -1754,7 +1756,7 @@ string to be clipped. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_screening20200720' \ --input 'screening20200720.tsv' \ --fasta '../Reference/genome/GRCh38.fa' \ @@ -1775,7 +1777,7 @@ tell nf-core/eager what to do with the off target reads from the mapping. 
```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_screening20200720' \ --input 'screening20200720.tsv' \ --fasta '../Reference/genome/GRCh38.fa' \ @@ -1805,7 +1807,7 @@ documentation describing each parameters can be seen in the usage ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_screening20200720' \ --input 'screening20200720.tsv' \ --fasta '../Reference/genome/GRCh38.fa' \ @@ -1832,7 +1834,7 @@ have indicators of true aDNA, we will run 'maltExtract' of the ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_screening20200720' \ --input 'screening20200720.tsv' \ --fasta '../Reference/genome/GRCh38.fa' \ @@ -2103,7 +2105,7 @@ running. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ <...> ``` @@ -2164,7 +2166,7 @@ FASTA file and the corresponding indices. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.fa' \ @@ -2189,7 +2191,7 @@ directory (which contains 'intermediate' working files and directories). ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.fa' \ @@ -2218,7 +2220,7 @@ the default minimum length of a poly-G string to be clipped. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.fa' \ @@ -2242,7 +2244,7 @@ will do this with `--bwaalnn` and `--bwaalnl` respectively. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.fa' \ @@ -2266,7 +2268,7 @@ hard-drive footprint. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.fa' \ @@ -2296,7 +2298,7 @@ clarity. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.fa' \ @@ -2327,7 +2329,7 @@ often a custom BED file with just genes of interest is recommended. 
Furthermore ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.fa' \ @@ -2365,7 +2367,7 @@ we do BAM trimming instead here as another demonstration of functionality. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.fa' \ @@ -2406,7 +2408,7 @@ need to specify that we want to use the trimmed bams from the previous step. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.fa' \ @@ -2449,7 +2451,7 @@ same settings and reference genome. We can do this as follows. ```bash nextflow run nf-core/eager \ -r 2.2.0 \ --profile sdag,shh,singularity \ +-profile singularity,shh,sdag \ -name 'projectX_preprocessing20200727' \ --input 'preprocessing20200727.tsv' \ --fasta '../Reference/genome/Yersinia_pestis_C092_GCF_000009065.1_ASM906v1.fa' \ diff --git a/environment.yml b/environment.yml index e971624c5..3475472e0 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-eager-2.3.1 +name: nf-core-eager-2.3.2 channels: - conda-forge - bioconda @@ -26,7 +26,7 @@ dependencies: - bioconda::qualimap=2.2.2d - bioconda::vcf2genome=0.91 - bioconda::damageprofiler=0.4.9 # Don't upgrade - later versions don't allow java 8 - - bioconda::multiqc=1.9 + - bioconda::multiqc=1.10 - bioconda::pmdtools=0.60 - bioconda::bedtools=2.29.2 - conda-forge::libiconv=1.15 @@ -37,16 +37,17 @@ dependencies: - bioconda::bamutil=1.0.14 - bioconda::mtnucratio=0.7 - bioconda::pysam=0.15.4 #Says python3.7 or less - - bioconda::kraken2=2.0.9beta + - bioconda::kraken2=2.1.1 - conda-forge::pandas=1.0.4 #.4 is python3.8+ compatible - bioconda::freebayes=1.3.2 #should be fine with python 3.8, but says <3.7 on webpage - bioconda::sexdeterrmine=1.1.2 - bioconda::multivcfanalyzer=0.85.2 - - bioconda::hops=0.34 + - bioconda::hops=0.35 - conda-forge::biopython=1.76 - conda-forge::xopen=0.9.0 - bioconda::bowtie2=2.4.1 - bioconda::eigenstratdatabasetools=1.0.2 - bioconda::mapdamage2=2.2.0 - bioconda::bbmap=38.87 + - conda-forge::tbb=2020.2 # temp for bioconda broken bowtie2, remove once patched in bioconda diff --git a/lib/Checks.groovy b/lib/Checks.groovy new file mode 100644 index 000000000..4f804ec01 --- /dev/null +++ b/lib/Checks.groovy @@ -0,0 +1,85 @@ +import org.yaml.snakeyaml.Yaml + +/* + * This file holds several functions used to perform standard checks for the nf-core pipeline template. + */ + +class Checks { + + static void check_conda_channels(log) { + Yaml parser = new Yaml() + def channels = [] + try { + def config = parser.load("conda config --show channels".execute().text) + channels = config.channels + } catch(NullPointerException | IOException e) { + log.warn "Could not verify conda channel configuration." 
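+            // conda is unavailable or its config is unreadable, so the channel
+            // order cannot be checked - skip the check rather than failing the run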
+ return + } + + // Check that all channels are present + def required_channels = ['conda-forge', 'bioconda', 'defaults'] + def conda_check_failed = !required_channels.every { ch -> ch in channels } + + // Check that they are in the right order + conda_check_failed |= !(channels.indexOf('conda-forge') < channels.indexOf('bioconda')) + conda_check_failed |= !(channels.indexOf('bioconda') < channels.indexOf('defaults')) + + if (conda_check_failed) { + log.warn "=============================================================================\n" + + " There is a problem with your Conda configuration!\n\n" + + " You will need to set-up the conda-forge and bioconda channels correctly.\n" + + " Please refer to https://bioconda.github.io/user/install.html#set-up-channels\n" + + " NB: The order of the channels matters!\n" + + "===================================================================================" + } + } + + static void aws_batch(workflow, params) { + if (workflow.profile.contains('awsbatch')) { + assert (params.awsqueue && params.awsregion) : "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" + // Check outdir paths to be S3 buckets if running on AWSBatch + // related: https://github.com/nextflow-io/nextflow/issues/813 + assert params.outdir.startsWith('s3:') : "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" + // Prevent trace files to be stored on S3 since S3 does not support rolling files. + assert !params.tracedir.startsWith('s3:') : "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." + } + } + + static void hostname(workflow, params, log) { + Map colors = Headers.log_colours(params.monochrome_logs) + if (params.hostnames) { + def hostname = "hostname".execute().text.trim() + params.hostnames.each { prof, hnames -> + hnames.each { hname -> + if (hostname.contains(hname) && !workflow.profile.contains(prof)) { + log.info "=${colors.yellow}====================================================${colors.reset}=\n" + + "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + + " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + + " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + + "=${colors.yellow}====================================================${colors.reset}=" + } + } + } + } + } + + // Citation string + private static String citation(workflow) { + return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + + "* The pipeline\n" + + " https://doi.org/10.1101/2020.06.11.145615\n\n" + + "* The nf-core framework\n" + + " https://dx.doi.org/10.1038/s41587-020-0439-x\n" + + " https://rdcu.be/b1GjZ\n\n" + + "* Software dependencies\n" + + " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" + } + + + + + + + +} diff --git a/lib/Completion.groovy b/lib/Completion.groovy new file mode 100644 index 000000000..2348fa440 --- /dev/null +++ b/lib/Completion.groovy @@ -0,0 +1,129 @@ +/* + * Functions to be run on completion of pipeline + */ + +class Completion { + static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[]) { + + // Set up the e-mail variables + def subject = "[$workflow.manifest.name] Successful: $workflow.runName" + + if (!workflow.success) { + subject = "[$workflow.manifest.name] FAILED: $workflow.runName" + } + + def summary = [:] + for (group in summary_params.keySet()) { + summary << summary_params[group] + } + + def misc_fields = [:] + misc_fields['Date Started'] = workflow.start 
+ misc_fields['Date Completed'] = workflow.complete + misc_fields['Pipeline script file path'] = workflow.scriptFile + misc_fields['Pipeline script hash ID'] = workflow.scriptId + if (workflow.repository) misc_fields['Pipeline repository Git URL'] = workflow.repository + if (workflow.commitId) misc_fields['Pipeline repository Git Commit'] = workflow.commitId + if (workflow.revision) misc_fields['Pipeline Git branch/tag'] = workflow.revision + misc_fields['Nextflow Version'] = workflow.nextflow.version + misc_fields['Nextflow Build'] = workflow.nextflow.build + misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp + + def email_fields = [:] + email_fields['version'] = workflow.manifest.version + email_fields['runName'] = workflow.runName + email_fields['success'] = workflow.success + email_fields['dateComplete'] = workflow.complete + email_fields['duration'] = workflow.duration + email_fields['exitStatus'] = workflow.exitStatus + email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + email_fields['errorReport'] = (workflow.errorReport ?: 'None') + email_fields['commandLine'] = workflow.commandLine + email_fields['projectDir'] = workflow.projectDir + email_fields['summary'] = summary << misc_fields + + // On success try attach the multiqc report + def mqc_report = null + try { + if (workflow.success) { + mqc_report = multiqc_report.getVal() + if (mqc_report.getClass() == ArrayList && mqc_report.size() >= 1) { + if (mqc_report.size() > 1) { + log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" + } + mqc_report = mqc_report[0] + } + } + } catch (all) { + log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" + } + + // Check if we are only sending emails on failure + def email_address = params.email + if (!params.email && params.email_on_fail && !workflow.success) { + email_address = params.email_on_fail + } + + // Render the TXT template + def engine = new groovy.text.GStringTemplateEngine() + def tf = new File("$projectDir/assets/email_template.txt") + def txt_template = engine.createTemplate(tf).make(email_fields) + def email_txt = txt_template.toString() + + // Render the HTML template + def hf = new File("$projectDir/assets/email_template.html") + def html_template = engine.createTemplate(hf).make(email_fields) + def email_html = html_template.toString() + + // Render the sendmail template + def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit + def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes()] + def sf = new File("$projectDir/assets/sendmail_template.txt") + def sendmail_template = engine.createTemplate(sf).make(smail_fields) + def sendmail_html = sendmail_template.toString() + + // Send the HTML e-mail + Map colors = Headers.log_colours(params.monochrome_logs) + if (email_address) { + try { + if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } + // Try to send HTML e-mail using sendmail + [ 'sendmail', '-t' ].execute() << sendmail_html + log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" + } catch (all) { + // Catch failures and try with plaintext + def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] + if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) 
{ + mail_cmd += [ '-A', mqc_report ] + } + mail_cmd.execute() << email_html + log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (mail)-" + } + } + + // Write summary e-mail HTML to a file + def output_d = new File("${params.outdir}/pipeline_info/") + if (!output_d.exists()) { + output_d.mkdirs() + } + def output_hf = new File(output_d, "pipeline_report.html") + output_hf.withWriter { w -> w << email_html } + def output_tf = new File(output_d, "pipeline_report.txt") + output_tf.withWriter { w -> w << email_txt } + } + + static void summary(workflow, params, log, fail_percent_mapped=[:], pass_percent_mapped=[:]) { + Map colors = Headers.log_colours(params.monochrome_logs) + + if (workflow.success) { + if (workflow.stats.ignoredCount == 0) { + log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" + } else { + log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" + } + } else { + Checks.hostname(workflow, params, log) + log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" + } + } +} diff --git a/lib/Headers.groovy b/lib/Headers.groovy new file mode 100644 index 000000000..15d1d3880 --- /dev/null +++ b/lib/Headers.groovy @@ -0,0 +1,43 @@ +/* + * This file holds several functions used to render the nf-core ANSI header. + */ + +class Headers { + + private static Map log_colours(Boolean monochrome_logs) { + Map colorcodes = [:] + colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" + colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" + colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" + colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" + colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" + colorcodes['yellow_bold'] = monochrome_logs ? '' : "\033[1;93m" + colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" + colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" + colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" + colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" + colorcodes['red'] = monochrome_logs ? '' : "\033[1;91m" + return colorcodes + } + + static String dashed_line(monochrome_logs) { + Map colors = log_colours(monochrome_logs) + return "-${colors.dim}----------------------------------------------------${colors.reset}-" + } + + static String nf_core(workflow, monochrome_logs) { + Map colors = log_colours(monochrome_logs) + String.format( + """\n + ${dashed_line(monochrome_logs)} + ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} + ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} + ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} + ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} + ${colors.green}`._,._,\'${colors.reset} + ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} + ${dashed_line(monochrome_logs)} + """.stripIndent() + ) + } +} diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy new file mode 100644 index 000000000..42e32dc1c --- /dev/null +++ b/lib/NfcoreSchema.groovy @@ -0,0 +1,549 @@ +/* + * This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. 
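+ * Parameter validation is backed by the org.everit JSON Schema library (see the imports below).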
+ */
+
+import org.everit.json.schema.Schema
+import org.everit.json.schema.loader.SchemaLoader
+import org.everit.json.schema.ValidationException
+import org.json.JSONObject
+import org.json.JSONTokener
+import org.json.JSONArray
+import groovy.json.JsonSlurper
+import groovy.json.JsonBuilder
+
+class NfcoreSchema {
+
+    /*
+     * Function to loop over all parameters defined in schema and check
+     * whether the given parameters adhere to the specifications
+     */
+    /* groovylint-disable-next-line UnusedPrivateMethodParameter */
+    private static ArrayList validateParameters(params, jsonSchema, log) {
+        def has_error = false
+        //=====================================================================//
+        // Check for nextflow core params and unexpected params
+        def json = new File(jsonSchema).text
+        def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions')
+        def specifiedParamKeys = params.keySet()
+        def nf_params = [
+            // Options for base `nextflow` command
+            'bg',
+            'c',
+            'C',
+            'config',
+            'd',
+            'D',
+            'dockerize',
+            'h',
+            'log',
+            'q',
+            'quiet',
+            'syslog',
+            'v',
+            'version',
+
+            // Options for `nextflow run` command
+            'ansi',
+            'ansi-log',
+            'bg',
+            'bucket-dir',
+            'c',
+            'cache',
+            'config',
+            'dsl2',
+            'dump-channels',
+            'dump-hashes',
+            'E',
+            'entry',
+            'latest',
+            'lib',
+            'main-script',
+            'N',
+            'name',
+            'offline',
+            'params-file',
+            'pi',
+            'plugins',
+            'poll-interval',
+            'pool-size',
+            'profile',
+            'ps',
+            'qs',
+            'queue-size',
+            'r',
+            'resume',
+            'revision',
+            'stdin',
+            'stub',
+            'stub-run',
+            'test',
+            'w',
+            'with-charliecloud',
+            'with-conda',
+            'with-dag',
+            'with-docker',
+            'with-mpi',
+            'with-notification',
+            'with-podman',
+            'with-report',
+            'with-singularity',
+            'with-timeline',
+            'with-tower',
+            'with-trace',
+            'with-weblog',
+            'without-docker',
+            'without-podman',
+            'work-dir'
+        ]
+        def unexpectedParams = []
+
+        // Collect expected parameters from the schema
+        def expectedParams = []
+        for (group in schemaParams) {
+            for (p in group.value['properties']) {
+                expectedParams.push(p.key)
+            }
+        }
+
+        for (specifiedParam in specifiedParamKeys) {
+            // nextflow params
+            if (nf_params.contains(specifiedParam)) {
+                log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. Please resubmit with '-${specifiedParam}'"
+                has_error = true
+            }
+            // unexpected params
+            def params_ignore = params.schema_ignore_params.split(',') + 'schema_ignore_params'
+            if (!expectedParams.contains(specifiedParam) && !params_ignore.contains(specifiedParam)) {
+                unexpectedParams.push(specifiedParam)
+            }
+        }
+
+        //=====================================================================//
+        // Validate parameters against the schema
+        InputStream inputStream = new File(jsonSchema).newInputStream()
+        JSONObject rawSchema = new JSONObject(new JSONTokener(inputStream))
+        Schema schema = SchemaLoader.load(rawSchema)
+
+        // Clean the parameters
+        def cleanedParams = cleanParameters(params)
+
+        // Convert to JSONObject
+        def jsonParams = new JsonBuilder(cleanedParams)
+        JSONObject paramsJSON = new JSONObject(jsonParams.toString())
+
+        // Validate
+        try {
+            schema.validate(paramsJSON)
+        } catch (ValidationException e) {
+            println ''
+            log.error 'ERROR: Validation of pipeline parameters failed!'
+            JSONObject exceptionJSON = e.toJSON()
+            printExceptions(exceptionJSON, paramsJSON, log)
+            println ''
+            has_error = true
+        }
+
+        // Check for unexpected parameters
+        // Getting this message a lot for parameters that you *do* expect?
+ // You can make a csv list of expected params not in the schema with 'params.schema_ignore_params' + // for example, in your institutional config + if (unexpectedParams.size() > 0) { + Map colors = log_colours(params.monochrome_logs) + println '' + def warn_msg = 'Found unexpected parameters:' + for (unexpectedParam in unexpectedParams) { + warn_msg = warn_msg + "\n* --${unexpectedParam}: ${paramsJSON[unexpectedParam].toString()}" + } + log.warn warn_msg + log.info "- ${colors.dim}(Hide this message with 'params.schema_ignore_params')${colors.reset} -" + println '' + } + + if (has_error) { + System.exit(1) + } + + return unexpectedParams + } + + // Loop over nested exceptions and print the causingException + private static void printExceptions(exJSON, paramsJSON, log) { + def causingExceptions = exJSON['causingExceptions'] + if (causingExceptions.length() == 0) { + def m = exJSON['message'] =~ /required key \[([^\]]+)\] not found/ + // Missing required param + if (m.matches()) { + log.error "* Missing required parameter: --${m[0][1]}" + } + // Other base-level error + else if (exJSON['pointerToViolation'] == '#') { + log.error "* ${exJSON['message']}" + } + // Error with specific param + else { + def param = exJSON['pointerToViolation'] - ~/^#\// + def param_val = paramsJSON[param].toString() + log.error "* --${param}: ${exJSON['message']} (${param_val})" + } + } + for (ex in causingExceptions) { + printExceptions(ex, paramsJSON, log) + } + } + + private static Map cleanParameters(params) { + def new_params = params.getClass().newInstance(params) + for (p in params) { + // remove anything evaluating to false + if (!p['value']) { + new_params.remove(p.key) + } + // Cast MemoryUnit to String + if (p['value'].getClass() == nextflow.util.MemoryUnit) { + new_params.replace(p.key, p['value'].toString()) + } + // Cast Duration to String + if (p['value'].getClass() == nextflow.util.Duration) { + new_params.replace(p.key, p['value'].toString()) + } + // Cast LinkedHashMap to String + if (p['value'].getClass() == LinkedHashMap) { + new_params.replace(p.key, p['value'].toString()) + } + } + return new_params + } + + /* + * This method tries to read a JSON params file + */ + private static LinkedHashMap params_load(String json_schema) { + def params_map = new LinkedHashMap() + try { + params_map = params_read(json_schema) + } catch (Exception e) { + println "Could not read parameters settings from JSON. $e" + params_map = new LinkedHashMap() + } + return params_map + } + + private static Map log_colours(Boolean monochrome_logs) { + Map colorcodes = [:] + + // Reset / Meta + colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" + colorcodes['bold'] = monochrome_logs ? '' : "\033[1m" + colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" + colorcodes['underlined'] = monochrome_logs ? '' : "\033[4m" + colorcodes['blink'] = monochrome_logs ? '' : "\033[5m" + colorcodes['reverse'] = monochrome_logs ? '' : "\033[7m" + colorcodes['hidden'] = monochrome_logs ? '' : "\033[8m" + + // Regular Colors + colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" + colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" + colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" + colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" + colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" + colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" + colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" + colorcodes['white'] = monochrome_logs ? 
'' : "\033[0;37m" + + // Bold + colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" + colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" + colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" + colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" + colorcodes['bblue'] = monochrome_logs ? '' : "\033[1;34m" + colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" + colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" + colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" + + // Underline + colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" + colorcodes['ured'] = monochrome_logs ? '' : "\033[4;31m" + colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" + colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" + colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" + colorcodes['upurple'] = monochrome_logs ? '' : "\033[4;35m" + colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" + colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" + + // High Intensity + colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" + colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" + colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" + colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" + colorcodes['iblue'] = monochrome_logs ? '' : "\033[0;94m" + colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" + colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" + colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" + + // Bold High Intensity + colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" + colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" + colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" + colorcodes['biyellow'] = monochrome_logs ? '' : "\033[1;93m" + colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" + colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" + colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" + colorcodes['biwhite'] = monochrome_logs ? '' : "\033[1;97m" + + return colorcodes + } + + static String dashed_line(monochrome_logs) { + Map colors = log_colours(monochrome_logs) + return "-${colors.dim}----------------------------------------------------${colors.reset}-" + } + + /* + Method to actually read in JSON file using Groovy. + Group (as Key), values are all parameters + - Parameter1 as Key, Description as Value + - Parameter2 as Key, Description as Value + .... 
+ Group + - + */ + private static LinkedHashMap params_read(String json_schema) throws Exception { + def json = new File(json_schema).text + def Map schema_definitions = (Map) new JsonSlurper().parseText(json).get('definitions') + def Map schema_properties = (Map) new JsonSlurper().parseText(json).get('properties') /* Tree looks like this in nf-core schema + * definitions <- this is what the first get('definitions') gets us + group 1 + title + description + properties + parameter 1 + type + description + parameter 2 + type + description + group 2 + title + description + properties + parameter 1 + type + description + */ + def params_map = new LinkedHashMap() + schema_definitions.each { key, val -> + def Map group = schema_definitions."$key".properties // Gets the property object of the group + def title = schema_definitions."$key".title + def sub_params = new LinkedHashMap() + group.each { innerkey, value -> + sub_params.put(innerkey, value) + } + params_map.put(title, sub_params) + } + + // Ungrouped params + def ungrouped_params = new LinkedHashMap() + schema_properties.each { innerkey, value -> + ungrouped_params.put(innerkey, value) + } + params_map.put("Other parameters", ungrouped_params) + + return params_map + } + + /* + * Get maximum number of characters across all parameter names + */ + private static Integer params_max_chars(params_map) { + Integer max_chars = 0 + for (group in params_map.keySet()) { + def group_params = params_map.get(group) // This gets the parameters of that particular group + for (param in group_params.keySet()) { + if (param.size() > max_chars) { + max_chars = param.size() + } + } + } + return max_chars + } + + /* + * Beautify parameters for --help + */ + private static String params_help(workflow, params, json_schema, command) { + Map colors = log_colours(params.monochrome_logs) + Integer num_hidden = 0 + String output = '' + output += 'Typical pipeline command:\n\n' + output += " ${colors.cyan}${command}${colors.reset}\n\n" + Map params_map = params_load(json_schema) + Integer max_chars = params_max_chars(params_map) + 1 + Integer desc_indent = max_chars + 14 + Integer dec_linewidth = 160 - desc_indent + for (group in params_map.keySet()) { + Integer num_params = 0 + String group_output = colors.underlined + colors.bold + group + colors.reset + '\n' + def group_params = params_map.get(group) // This gets the parameters of that particular group + for (param in group_params.keySet()) { + if (group_params.get(param).hidden && !params.show_hidden_params) { + num_hidden += 1 + continue; + } + def type = '[' + group_params.get(param).type + ']' + def description = group_params.get(param).description + def defaultValue = group_params.get(param).default ? 
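Since `params_read()` is the core of the schema handling, a toy example may help. This sketch (with invented schema content) shows the grouped map it produces, with ungrouped properties collected under 'Other parameters':

```groovy
// Toy illustration of params_read() input and output (not pipeline code).
import groovy.json.JsonSlurper

def json = '''{
  "definitions": {
    "input_options": {
      "title": "Input options",
      "properties": { "input": { "type": "string", "description": "Path to input TSV" } }
    }
  },
  "properties": { "outdir": { "type": "string", "description": "Results directory" } }
}'''
assert new JsonSlurper().parseText(json).definitions.input_options.title == 'Input options'
// params_read() would return:
// ['Input options'   : [input : [type: 'string', description: 'Path to input TSV']],
//  'Other parameters': [outdir: [type: 'string', description: 'Results directory']]]
```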
" [default: " + group_params.get(param).default.toString() + "]" : '' + def description_default = description + colors.dim + defaultValue + colors.reset + // Wrap long description texts + // Loosely based on https://dzone.com/articles/groovy-plain-text-word-wrap + if (description_default.length() > dec_linewidth){ + List olines = [] + String oline = "" // " " * indent + description_default.split(" ").each() { wrd -> + if ((oline.size() + wrd.size()) <= dec_linewidth) { + oline += wrd + " " + } else { + olines += oline + oline = wrd + " " + } + } + olines += oline + description_default = olines.join("\n" + " " * desc_indent) + } + group_output += " --" + param.padRight(max_chars) + colors.dim + type.padRight(10) + colors.reset + description_default + '\n' + num_params += 1 + } + group_output += '\n' + if (num_params > 0){ + output += group_output + } + } + output += dashed_line(params.monochrome_logs) + if (num_hidden > 0){ + output += colors.dim + "\n Hiding $num_hidden params, use --show_hidden_params to show.\n" + colors.reset + output += dashed_line(params.monochrome_logs) + } + return output + } + + /* + * Groovy Map summarising parameters/workflow options used by the pipeline + */ + private static LinkedHashMap params_summary_map(workflow, params, json_schema) { + // Get a selection of core Nextflow workflow options + def Map workflow_summary = [:] + if (workflow.revision) { + workflow_summary['revision'] = workflow.revision + } + workflow_summary['runName'] = workflow.runName + if (workflow.containerEngine) { + workflow_summary['containerEngine'] = "$workflow.containerEngine" + } + if (workflow.container) { + workflow_summary['container'] = "$workflow.container" + } + workflow_summary['launchDir'] = workflow.launchDir + workflow_summary['workDir'] = workflow.workDir + workflow_summary['projectDir'] = workflow.projectDir + workflow_summary['userName'] = workflow.userName + workflow_summary['profile'] = workflow.profile + workflow_summary['configFiles'] = workflow.configFiles.join(', ') + + // Get pipeline parameters defined in JSON Schema + def Map params_summary = [:] + def blacklist = ['hostnames'] + def params_map = params_load(json_schema) + for (group in params_map.keySet()) { + def sub_params = new LinkedHashMap() + def group_params = params_map.get(group) // This gets the parameters of that particular group + for (param in group_params.keySet()) { + if (params.containsKey(param) && !blacklist.contains(param)) { + def params_value = params.get(param) + def schema_value = group_params.get(param).default + def param_type = group_params.get(param).type + if (schema_value == null) { + if (param_type == 'boolean') { + schema_value = false + } + if (param_type == 'string') { + schema_value = '' + } + if (param_type == 'integer') { + schema_value = 0 + } + } else { + if (param_type == 'string') { + if (schema_value.contains('$projectDir') || schema_value.contains('${projectDir}')) { + def sub_string = schema_value.replace('\$projectDir', '') + sub_string = sub_string.replace('\${projectDir}', '') + if (params_value.contains(sub_string)) { + schema_value = params_value + } + } + if (schema_value.contains('$params.outdir') || schema_value.contains('${params.outdir}')) { + def sub_string = schema_value.replace('\$params.outdir', '') + sub_string = sub_string.replace('\${params.outdir}', '') + if ("${params.outdir}${sub_string}" == params_value) { + schema_value = params_value + } + } + } + } + + if (params_value != schema_value) { + sub_params.put("$param", params_value) + } + } + } + 
params_summary.put(group, sub_params) + } + return [ 'Core Nextflow options' : workflow_summary ] << params_summary + } + + /* + * Beautify parameters for summary and return as string + */ + private static String params_summary_log(workflow, params, json_schema) { + String output = '' + def params_map = params_summary_map(workflow, params, json_schema) + def max_chars = params_max_chars(params_map) + for (group in params_map.keySet()) { + def group_params = params_map.get(group) // This gets the parameters of that particular group + if (group_params) { + output += group + '\n' + for (param in group_params.keySet()) { + output += " \u001B[1m" + param.padRight(max_chars) + ": \u001B[1m" + group_params.get(param) + '\n' + } + output += '\n' + } + } + output += "[Only displaying parameters that differ from pipeline default]\n" + output += dashed_line(params.monochrome_logs) + output += '\n\n' + dashed_line(params.monochrome_logs) + return output + } + + static String params_summary_multiqc(workflow, summary) { + String summary_section = '' + for (group in summary.keySet()) { + def group_params = summary.get(group) // This gets the parameters of that particular group + if (group_params) { + summary_section += "

<p style=\"font-size:110%\"><b>$group</b></p>\n" + summary_section += "<dl class=\"dl-horizontal\">\n" + for (param in group_params.keySet()) { + summary_section += "<dt>$param</dt><dd><samp>${group_params.get(param) ?: 'N/A'}</samp></dd>\n" + } + summary_section += "</dl>
\n" + } + } + + String yaml_file_text = "id: '${workflow.manifest.name.replace('/','-')}-summary'\n" + yaml_file_text += "description: ' - this information is collected when the pipeline is started.'\n" + yaml_file_text += "section_name: '${workflow.manifest.name} Workflow Summary'\n" + yaml_file_text += "section_href: 'https://github.com/${workflow.manifest.name}'\n" + yaml_file_text += "plot_type: 'html'\n" + yaml_file_text += "data: |\n" + yaml_file_text += "${summary_section}" + return yaml_file_text + } + +} diff --git a/lib/nfcore_external_java_deps.jar b/lib/nfcore_external_java_deps.jar new file mode 100644 index 000000000..805c8bb5e Binary files /dev/null and b/lib/nfcore_external_java_deps.jar differ diff --git a/main.nf b/main.nf index 88ebe9e4d..f3ad4b2a0 100644 --- a/main.nf +++ b/main.nf @@ -1,268 +1,47 @@ #!/usr/bin/env nextflow /* -============================================================================================================ +------------------------------------------------------------------------------------------------------------ nf-core/eager -============================================================================================================ +------------------------------------------------------------------------------------------------------------ EAGER Analysis Pipeline. Started 2018-06-05 #### Homepage / Documentation https://github.com/nf-core/eager #### Authors For a list of authors and contributors, see: https://github.com/nf-core/eager/tree/dev#authors-alphabetical -============================================================================================================ +------------------------------------------------------------------------------------------------------------ */ -def helpMessage() { - log.info nfcoreHeader() - log.info""" - ========================================= - eager v${workflow.manifest.version} - ========================================= - Usage: - - The typical command for running the pipeline is as follows: - - nextflow run nf-core/eager -profile --reads'*_R{1,2}.fastq.gz' --fasta '.fasta' - - Mandatory arguments: - -profile [str] Configuration profile to use. Can use multiple (comma separated). Ask system administrator if unsure. - Available: conda, docker, singularity, test, awsbatch, and more - Input - --input [file] Either paths or URLs to FASTQ/BAM data (must be surrounded with quotes). Indicate multiple files with wildcards (*). For paired end data, the path must use '{1,2}' notation to specify read pairs. - OR - A path to a TSV file (ending .tsv) containing file paths and sequencing/sample metadata. Allows for merging of multiple lanes/libraries/samples. Please see documentation for template. - - --udg_type [str] Specify here if you have UDG treated libraries, Set to 'half' for partial treatment, or 'full' for UDG. If not set, libraries are assumed to have no UDG treatment ('none'). Not required for TSV input. Default: ${params.udg_type} - --single_stranded [bool] Specifies that libraries are single stranded. Only effects MaltExtract and genotyping pileupCaller. Not required for TSV input. - --single_end [bool] Specifies that the input is single end reads. Not required for TSV input. - --colour_chemistry [num] Specifies what Illumina sequencing chemistry was used. Used to inform whether to poly-G trim if turned on (see below). Not required for TSV input. Options: 2, 4. Default: ${params.colour_chemistry} - --bam [bool] Specifies that the input is in BAM format. Not required for TSV input. 
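For orientation, the `yaml_file_text` assembled at the close of the NfcoreSchema class above is MultiQC 'custom content'. A sketch of what it evaluates to for a toy summary map (the run name is invented):

```groovy
// Sketch: what params_summary_multiqc() produces for a minimal summary.
def summary = ['Core Nextflow options': ['runName': 'hungry_pasteur']]
// Resulting yaml_file_text (abridged):
//   id: 'nf-core-eager-summary'
//   section_name: 'nf-core/eager Workflow Summary'
//   plot_type: 'html'
//   data: |
//       <p style="font-size:110%"><b>Core Nextflow options</b></p>
//       <dl class="dl-horizontal">
//           <dt>runName</dt><dd><samp>hungry_pasteur</samp></dd>
//       </dl>
```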
- - - Additional Options: - --snpcapture_bed [file] If library result of SNP capture, path to BED file containing SNPs positions on reference genome. - --run_convertinputbam [bool] Turns on conversion of an input BAM file into FASTQ format to allow re-preprocessing (e.g. AdapterRemoval etc.). - - References - --fasta [file] Path or URL to a FASTA reference file (required if not iGenome reference). File suffixes can be: '.fa', '.fn', '.fna', '.fasta' - --genome [str] Name of iGenomes reference (required if not FASTA reference). - --bwa_index [dir] Path to directory containing pre-made BWA indices (i.e. everything before the endings '.amb' '.ann' '.bwt'. Most likely the same path as --fasta). If not supplied will be made for you. - --bt2_index [dir] Path to directory containing pre-made Bowtie2 indices (i.e. everything before the endings e.g. '.1.bt2', '.2.bt2', '.rev.1.bt2'. Most likely the same value as --fasta). If not supplied will be made for you. - --fasta_index [file] Path to samtools FASTA index (typically ending in '.fai'). - --seq_dict [file] Path to picard sequence dictionary file (typically ending in '.dict'). - --large_ref [bool] Specify to generate more recent '.csi' BAM indices. If your reference genome is larger than 3.5GB, this is recommended due to more efficient data handling with the '.csi' format over the older '.bai'. - --save_reference [bool] Turns on saving reference genome indices for later re-usage. - - Output options: - --outdir [dir] The output directory where the results will be saved. Default: ${params.outdir} - -w [dir] The directory where intermediate files will be stored. Recommended: '/work/' - - Skipping Skip any of the mentioned steps. - --skip_fastqc [bool] Skips both pre- and post-Adapter Removal FastQC steps. - --skip_adapterremoval [bool] - --skip_preseq [bool] - --skip_deduplication [bool] - --skip_damage_calculation [bool] - --skip_qualimap [bool] - - Complexity Filtering - --complexity_filter_poly_g [bool] Turn on running poly-G removal on FASTQ files. Will only be performed on 2 colour chemistry machine sequenced libraries. - --complexity_filter_poly_g_min [num] Specify length of poly-g min for clipping to be performed. Default: ${params.complexity_filter_poly_g_min} - - Clipping / Merging - --clip_forward_adaptor [str] Specify adapter sequence to be clipped off (forward strand). Default: '${params.clip_forward_adaptor}' - --clip_reverse_adaptor [str] Specify adapter sequence to be clipped off (reverse strand). Default: '${params.clip_reverse_adaptor}' - --clip_readlength [num] Specify read minimum length to be kept for downstream analysis. Default: ${params.clip_readlength} - --clip_min_read_quality [num] Specify minimum base quality for trimming off bases. Default: ${params.clip_min_read_quality} - --min_adap_overlap [num] Specify minimum adapter overlap: Default: ${params.min_adap_overlap} - --skip_collapse [bool] Skip merging forward and reverse reads together. Only applicable for paired-end libraries. - --skip_trim [bool] Skip adapter and quality trimming - --preserve5p [bool] Skip 5p quality base trimming (n, score, window) of 5 prime end. - --mergedonly [bool] Only use merged reads downstream (un-merged reads and singletons are discarded). - - Mapping - --mapper [str] Specify which mapper to use. Options: 'bwaaln', 'bwamem', 'circularmapper', 'bowtie2'. Default: '${params.mapper}' - --bwaalnn [num] Specify the -n parameter for BWA aln, i.e. amount of allowed mismatches in alignments. 
Default: ${params.bwaalnn} - --bwaalnk [num] Specify the -k parameter for BWA aln, i.e. maximum edit distance allowed in a seed. Default: ${params.bwaalnk} - --bwaalnl [num] Specify the -l parameter for BWA aln, i.e. length of seeds to be used. Set to 1024 for whole read. Default: ${params.bwaalnl} - --circularextension [num] Specify the number of bases to extend reference by (circularmapper only). Default: ${params.circularextension} - --circulartarget [chr] Specify the FASTA header of the target chromosome to extend(circularmapper only). Default: '${params.circulartarget}' - --circularfilter [bool] Turn on to remove reads that did not map to the circularised genome (circularmapper only). - --bt2_alignmode [str] Specify the bowtie2 alignment mode. Options: 'local', 'end-to-end'. Default: '${params.bt2_alignmode}' - --bt2_sensitivity [str] Specify the level of sensitivity for the bowtie2 alignment mode. Options: 'no-preset', 'very-fast', 'fast', 'sensitive', 'very-sensitive'. Default: '${params.bt2_sensitivity}' - --bt2n [num] Specify the -N parameter for bowtie2 (mismatches in seed). This will override defaults from alignmode/sensitivity. Default: ${params.bt2n} - --bt2l [num] Specify the -L parameter for bowtie2 (length of seed substrings). This will override defaults from alignmode/sensitivity. Default: ${params.bt2l} - --bt2_trim5 [num] Specify number of bases to trim off from 5' (left) end of read before alignment. Default: ${params.bt2_trim5} - --bt2_trim3 [num] Specify number of bases to trim off from 3' (right) end of read before alignment. Default: ${params.bt2_trim3} - - Host removal - --hostremoval_input_fastq [bool] Turn on creating pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data) - --hostremoval_mode [str] Host DNA Removal mode. Remove mapped reads completely from FASTQ (remove) or just mask mapped reads sequence by N (replace). Default: '${params.hostremoval_mode}' - - BAM Filtering - --run_bam_filtering [bool] Turn on filtering of mapping quality, read lengths, or unmapped reads of BAM files. - --bam_mapping_quality_threshold [num] Minimum mapping quality for reads filter. Default: ${params.bam_mapping_quality_threshold} - --bam_filter_minreadlength [num] Specify minimum read length to be kept after mapping. - --bam_unmapped_type [str] Defines whether to discard all unmapped reads, keep both mapped and unmapped together, or save as bam and/or only fastq format Options: 'discard', 'bam', 'keep', 'fastq', 'both'. Default: '${params.bam_unmapped_type}' - - DeDuplication - --dedupper [str] Deduplication method to use. Options: 'markduplicates', 'dedup'. Default: '${params.dedupper}' - --dedup_all_merged [bool] Turn on treating all reads as merged reads. - - Library Complexity Estimation - --preseq_step_size [num] Specify the step size of Preseq. Default: ${params.preseq_step_size} - - (aDNA) Damage Analysis - --damageprofiler_length [num] Specify length filter for DamageProfiler. Default: ${params.damageprofiler_length} - --damageprofiler_threshold [num] Specify number of bases of each read to consider for DamageProfiler calculations. Default: ${params.damageprofiler_threshold} - --damageprofiler_yaxis [float] Specify the maximum misincorporation frequency that should be displayed on damage plot. Set to 0 to 'autoscale'. Default: ${params.damageprofiler_yaxis} - --run_mapdamage_rescaling Turn on damage rescaling of BAM files using mapDamage2 to probabilistically remove damage. 
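As a concrete illustration of the mapping options documented above, a hypothetical user config tuning BWA aln for degraded reads (the values are examples only, not recommendations made by this changeset):

```groovy
// Hypothetical run config: BWA aln settings often adjusted for ancient DNA.
params {
    mapper  = 'bwaaln'
    bwaalnn = 0.01  // example: more tolerant of damage-derived mismatches
    bwaalnl = 1024  // per the help text above, 1024 uses the whole read (no seed)
}
```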
- --rescale_length_5p Length of read for mapDamage2 to rescale from 5p end. Default: ${params.rescale_length_5p} - --rescale_length_3p Length of read for mapDamage2 to rescale from 5p end. Default: ${params.rescale_length_3p} - --run_pmdtools [bool] Turn on PMDtools - --pmdtools_range [num] Specify range of bases for PMDTools. Default: ${params.pmdtools_range} - --pmdtools_threshold [num] Specify PMDScore threshold for PMDTools. Default: ${params.pmdtools_threshold} - --pmdtools_reference_mask [file] Specify a path to reference mask for PMDTools. - --pmdtools_max_reads [num] Specify the maximum number of reads to consider for metrics generation. Default: ${params.pmdtools_max_reads} - - Annotation Statistics - --run_bedtools_coverage [bool] Turn on ability to calculate no. reads, depth and breadth coverage of features in reference. - --anno_file [file] Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes. - - BAM Trimming - --run_trim_bam [bool] Turn on BAM trimming. Will only run on full-UDG or half-UDG libraries. - --bamutils_clip_half_udg_left [num] Specify the number of bases to clip off reads from 'left' end of read for half-UDG libraries. Default: ${params.bamutils_clip_half_udg_left} - --bamutils_clip_half_udg_right [num] Specify the number of bases to clip off reads from 'right' end of read for half-UDG libraries. Default: ${params.bamutils_clip_half_udg_right} - --bamutils_clip_none_udg_left [num] Specify the number of bases to clip off reads from 'left' end of read for non-UDG libraries. Default: ${params.bamutils_clip_none_udg_left} - --bamutils_clip_none_udg_right [num] Specify the number of bases to clip off reads from 'right' end of read for non-UDG libraries. Default: ${params.bamutils_clip_none_udg_right} - --bamutils_softclip [bool] Turn on using softclip instead of hard masking. - - Genotyping - --run_genotyping [bool] Turn on genotyping of BAM files. - --genotyping_tool [str] Specify which genotyper to use either GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller. Options: 'ug', 'hc', 'freebayes', 'pileupcaller', 'angsd'. - --genotyping_source [str] Specify which input BAM to use for genotyping. Options: 'raw', 'trimmed', 'pmd', 'rescaled'. Default: '${params.genotyping_source}' - --gatk_call_conf [num] Specify GATK phred-scaled confidence threshold. Default: ${params.gatk_call_conf} - --gatk_ploidy [num] Specify GATK organism ploidy. Default: ${params.gatk_ploidy} - --gatk_downsample [num] Maximum depth coverage allowed for genotyping before down-sampling is turned on. Default: ${params.gatk_downsample} - --gatk_dbsnp [file] Specify VCF file for output VCF SNP annotation. Optional. Gzip not accepted. - --gatk_hc_out_mode [str] Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_ACTIVE_SITES'. Default: '${params.gatk_hc_out_mode}' - --gatk_hc_emitrefconf [str] Specify HaplotypeCaller mode for emitting reference confidence calls . Options: 'NONE', 'BP_RESOLUTION', 'GVCF'. Default: '${params.gatk_hc_emitrefconf}' - --gatk_ug_out_mode [str] Specify GATK output mode. Options: 'EMIT_VARIANTS_ONLY', 'EMIT_ALL_CONFIDENT_SITES', 'EMIT_ALL_SITES'. Default: '${params.gatk_ug_out_mode}' - --gatk_ug_genotype_model [str] Specify UnifiedGenotyper likelihood model. Options: 'SNP', 'INDEL', 'BOTH', 'GENERALPLOIDYSNP', 'GENERALPLOIDYINDEL'. 
Default: '${params.gatk_ug_genotype_model}' - --gatk_ug_keep_realign_bam [bool] Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper. - --gatk_ug_defaultbasequalities [num] Supply a default base quality if a read is missing a base quality score. Setting to -1 turns this off. - --freebayes_C [num] Specify minimum required supporting observations to consider a variant. Default: ${params.freebayes_C} - --freebayes_g [num] Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified in --freebayes_C. Default: ${params.freebayes_g} - --freebayes_p [num] Specify ploidy of sample in FreeBayes. Default: ${params.freebayes_p} - --pileupcaller_bedfile [file] Specify path to SNP panel in bed format for pileupCaller. - --pileupcaller_snpfile [file] Specify path to SNP panel in EIGENSTRAT format for pileupCaller. - --pileupcaller_method [str] Specify calling method to use. Options: 'randomHaploid', 'randomDiploid', 'majorityCall'. Default: '${params.pileupcaller_method}' - --pileupcaller_transitions_mode [str] Specify the calling mode for transitions. Options: 'AllSites', 'TransitionsMissing', 'SkipTransitions'. Default: '${params.pileupcaller_transitions_mode}' - --angsd_glmodel [str] Specify which ANGSD genotyping likelihood model to use. Options: 'samtools', 'gatk', 'soapsnp', 'syk'. Default: '${params.angsd_glmodel}' - --angsd_glformat [str] Specify which output type to output ANGSD genotyping likelihood results: Options: 'text', 'binary', 'binary_three', 'beagle'. Default: '${params.angsd_glformat}' - --angsd_createfasta [bool] Turn on creation of FASTA from ANGSD genotyping likelhoood. - --angsd_fastamethod [str] Specify which genotype type of 'base calling' to use for ANGSD FASTA generation. Options: 'random', 'common'. Default: '${params.angsd_fastamethod}' - - Consensus Sequence Generation - --run_vcf2genome [bool] Turns on ability to create a consensus sequence FASTA file based on a UnifiedGenotyper VCF file and the original reference (only considers SNPs). - --vcf2genome_outfile [str] Specify name of the output FASTA file containing the consensus sequence. Do not include `.vcf` in the file name. Default: '' - --vcf2genome_header [str] Specify the header name of the consensus sequence entry within the FASTA file. Default: '' - --vcf2genome_minc [num] Minimum depth coverage required for a call to be included (else N will be called). Default: ${params.vcf2genome_minc} - --vcf2genome_minq [num] Minimum genotyping quality of a call to be called. Else N will be called. Default: ${params.vcf2genome_minq} - --vcf2genome_minfreq [float] Minimum fraction of reads supporting a call to be included. Else N will be called. Default: ${params.vcf2genome_minfreq} - - SNP Table Generation - --run_multivcfanalyzer [bool] Turn on MultiVCFAnalyzer. Note: This currently only supports diploid GATK UnifiedGenotyper input. - --write_allele_frequencies [bool] Turn on writing write allele frequencies in the SNP table. - --min_genotype_quality [num] Specify the minimum genotyping quality threshold for a SNP to be called. Default: ${params.min_genotype_quality} - --min_base_coverage [num] Specify the minimum number of reads a position needs to be covered to be considered for base calling. Default: ${params.min_base_coverage} - --min_allele_freq_hom [float] Specify the minimum allele frequency that a base requires to be considered a 'homozygous' call. 
Default: ${params.min_allele_freq_hom} - --min_allele_freq_het [float] Specify the minimum allele frequency that a base requires to be considered a 'heterozygous' call. Default: ${params.min_allele_freq_het} - --additional_vcf_files [file] Specify paths to additional pre-made VCF files to be included in the SNP table generation. Use wildcard(s) for multiple files. Optional. - --reference_gff_annotations [file] Specify path to the reference genome annotations in '.gff' format. Optional. - --reference_gff_exclude [file] Specify path to the positions to be excluded in '.gff' format. Optional. - --snp_eff_results [file] Specify path to the output file from SNP effect analysis in '.txt' format. Optional. - - Mitochondrial to Nuclear Ratio - --run_mtnucratio [bool] Turn on mitochondrial to nuclear ratio calculation. - --mtnucratio_header [str] Specify the name of the reference FASTA entry corresponding to the mitochondrial genome (up to the first space). Default: '${params.mtnucratio_header}' - - Sex Determination - --run_sexdeterrmine [bool] Turn on sex determination for human reference genomes. - --sexdeterrmine_bedfile [file] Specify path to SNP panel in bed format for error bar calculation. Optional (see documentation). - - Nuclear Contamination for Human DNA - --run_nuclear_contamination [bool] Turn on nuclear contamination estimation for human reference genomes. - --contamination_chrom_name [str] The name of the X chromosome in your bam or FASTA header. 'X' for hs37d5, 'chrX' for HG19. Default: '${params.contamination_chrom_name}' - - Metagenomic Screening - --metagenomic_complexity_filter Turn on removal of low-sequence complexity reads for metagenomic screening with bbduk. - --metagenomic_complexity_entropy Specify the entropy threshold that under which a sequencing read will be complexity filtered out. This should be between 0-1. Default: '${params.metagenomic_complexity_entropy}' - --run_metagenomic_screening [bool] Turn on metagenomic screening module for reference-unmapped reads - --metagenomic_tool [str] Specify which classifier to use. Options: 'malt', 'kraken'. Default: '${params.contamination_chrom_name}' - --database [dir] Specify path to classifer database directory. For Kraken2 this can also be a `.tar.gz` of the directory. - --metagenomic_min_support_reads [num] Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with . Default: ${params.metagenomic_min_support_reads} - --percent_identity [num] Percent identity value threshold for MALT. Default: ${params.percent_identity} - --malt_mode [str] Specify which alignment method to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'. Default: '${params.malt_mode}' - --malt_alignment_mode [str] Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'. Default: '${params.malt_alignment_mode}' - --malt_top_percent [num] Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual). Default: ${params.malt_top_percent} - --malt_min_support_mode [str] Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'. Default: '${params.malt_min_support_mode}' - --malt_min_support_percent [num] Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT. Default: Default: ${params.malt_min_support_percent} - --malt_max_queries [num] Specify the maximium number of queries a read can have for MALT. 
Default: ${params.malt_max_queries} - --malt_memory_mode [str] Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'. Default: '${params.malt_memory_mode}' - --malt_sam_output [bool] Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes. - - Metagenomic Authentication - --run_maltextract [bool] Turn on MaltExtract for MALT aDNA characteristics authentication - --maltextract_taxon_list [file] Path to a txt file with taxa of interest (one taxon per row, NCBI taxonomy name format) - --maltextract_ncbifiles [dir] Path to directory containing containing NCBI resource files (ncbi.tre and ncbi.map; avaliable: https://github.com/rhuebler/HOPS/) - --maltextract_filter [str] Specify which MaltExtract filter to use. Options: 'def_anc', 'ancient', 'default', 'crawl', 'scan', 'srna', 'assignment'. Default: '${params.maltextract_filter}' - --maltextract_toppercent [num] Specify percent of top alignments to use. Default: ${params.maltextract_toppercent} - --maltextract_destackingoff [bool] Turn off destacking. - --maltextract_downsamplingoff [bool] Turn off downsampling. - --maltextract_duplicateremovaloff [bool] Turn off duplicate removal. - --maltextract_matches [bool] Turn on exporting alignments of hits in BLAST format. - --maltextract_megansummary [bool] Turn on export of MEGAN summary files. - --maltextract_percentidentity [num] Minimum percent identity alignments are required to have to be reported. Recommended to set same as MALT parameter. Default: ${params.maltextract_percentidentity} - --maltextract_topalignment [int] Turn on using top alignments per read after filtering. - - Other options: - -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. - --max_memory [str] Memory limit for each step of pipeline. Should be in form e.g. --max_memory '8.GB'. Default: '${params.max_memory}' - --max_time [str] Time limit for each step of the pipeline. Should be in form e.g. --max_time '2.h'. Default: '${params.max_time}' - --max_cpus [str] Maximum number of CPUs to use for each step of the pipeline. Should be in form e.g. Default: '${params.max_cpus}' - --publish_dir_mode [str] Mode for publishing results in the output directory. Available: symlink, rellink, link, copy, copyNoFollow, move. Default: '${params.publish_dir_mode}' - --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful - --plaintext_email [email] Receive plain text emails rather than HTML - --max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) - - AWSBatch options: - --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch - --awsregion [str] The AWS Region for your AWS Batch job to run on - --awscli [str] Path to the AWS CLI tool - - For a full list and more information of available parameters, consider the documentation (https://github.com/nf-core/eager/). 
- """.stripIndent() -} - -/////////////////////////////////////////////////////////////////////////////// -/* -- SET UP CONFIGURATION VARIABLES -- */ -/////////////////////////////////////////////////////////////////////////////// - // Show help message params.help = false -if (params.help){ - helpMessage() +def json_schema = "$projectDir/nextflow_schema.json" +if (params.help) { + def command = "nextflow run nf-core/eager -profile <docker/singularity/conda> --reads '*_R{1,2}.fastq.gz' --fasta '<your_reference>.fasta'" + log.info NfcoreSchema.params_help(workflow, params, json_schema, command) exit 0 } +//////////////////////////////////////////////////// +/* -- VALIDATE PARAMETERS -- */ +//////////////////////////////////////////////////// + +def unexpectedParams = [] +if (params.validate_params) { + unexpectedParams = NfcoreSchema.validateParameters(params, json_schema, log) +} + +// Info required for completion email and summary +def multiqc_report = [] + // Small console separator to make it easier to read errors after launch println "" + + //////////////////////////////////////////////////// /* -- VALIDATE INPUTS -- */ //////////////////////////////////////////////////// + /**FASTA input handling **/ @@ -366,6 +145,11 @@ if (!has_extension(params.input, "tsv") && params.skip_collapse && params.singl exit 1, "[nf-core/eager] error: --skip_collapse can only be set for paired_end samples." } +// Validate that the user is not trying to skip both collapsing and trimming +if ( params.skip_collapse && params.skip_trim ) { + exit 1, "[nf-core/eager] error: you have specified to skip both merging and trimming of paired-end samples. Use --skip_adapterremoval instead." +} + // Host removal mode validation if (params.hostremoval_input_fastq){ if (!(['remove','replace'].contains(params.hostremoval_mode))) { @@ -386,8 +170,7 @@ if(params.run_bedtools_coverage && params.anno_file == ''){ if (!params.run_bedtools_coverage){ ch_anno_for_bedtools = Channel.empty() } else { - Channel - ch_anno_for_bedtools = Channel.fromPath(params.anno_file, checkIfExists: true) + ch_anno_for_bedtools = Channel.fromPath(params.anno_file, checkIfExists: true) .ifEmpty { exit 1, "[nf-core/eager] error: bedtools annotation file not found. Supplied parameter: --anno_file ${params.anno_file}."} } @@ -456,7 +239,7 @@ if (params.run_genotyping){ } if (params.genotyping_tool == 'angsd' && ! ( params.angsd_glformat == 'text' || params.angsd_glformat == 'binary' || params.angsd_glformat == 'binary_three' || params.angsd_glformat == 'beagle' ) ) { - exit 1, "[nf-core/eager] error: please check your ANGSD genotyping model! Options: 'text', 'binary', 'binary_three', 'beagle'. Found parameter: --angsd_glmodel '${params.angsd_glmodel}'." + exit 1, "[nf-core/eager] error: please check your ANGSD output format! Options: 'text', 'binary', 'binary_three', 'beagle'. Found parameter: --angsd_glformat '${params.angsd_glformat}'." } if ( !params.angsd_createfasta && params.angsd_fastamethod != 'random' ) { @@ -604,22 +387,10 @@ if ( params.maltextract_ncbifiles == '' ) { // Has the run name been specified by the user? // this has the bonus effect of catching both -name and --name -custom_runName = params.name if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { custom_runName = workflow.runName } -// Check AWS batch settings -if (workflow.profile.contains('awsbatch')) { - // AWSBatch sanity checking - if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" 
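These (now removed) sanity checks encode the AWS Batch contract: queue and region set, results on S3, trace files kept local. A hypothetical profile that would have satisfied them (bucket and queue names invented):

```groovy
// Hypothetical awsbatch settings: queue and region set, outdir on S3,
// trace files kept off S3 (rolling files are unsupported there).
params {
    awsqueue  = 'eager-job-queue'
    awsregion = 'eu-west-1'
    outdir    = 's3://my-bucket/eager-results'
    tracedir  = '/tmp/eager-trace'
}
```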
- // Check outdir paths to be S3 buckets if running on AWSBatch - // related: https://github.com/nextflow-io/nextflow/issues/813 - if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" - // Prevent trace files to be stored on S3 since S3 does not support rolling files. - if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." -} - //////////////////////////////////////////////////// /* -- CONFIG FILES -- */ //////////////////////////////////////////////////// @@ -724,100 +495,23 @@ ch_fastq_channel /* -- HEADER LOG INFO -- */ /////////////////////////////////////////////////// -log.info nfcoreHeader() -def summary = [:] -summary['Pipeline Name'] = 'nf-core/eager' -summary['Pipeline Version'] = workflow.manifest.version -if (workflow.revision) summary['Pipeline Release'] = workflow.revision -summary['Run Name'] = custom_runName ?: workflow.runName -summary['Input'] = params.input -summary['Convert input BAM?'] = params.run_convertinputbam ? 'Yes' : 'No' -summary['Fasta Ref'] = params.fasta -summary['BAM Index Type'] = (params.large_ref == "") ? 'BAI' : 'CSI' -if(params.bwa_index || params.bt2_index ) summary['BWA Index'] = "Yes" -summary['Skipping FASTQC?'] = params.skip_fastqc ? 'Yes' : 'No' -summary['Skipping AdapterRemoval?'] = params.skip_adapterremoval ? 'Yes' : 'No' -if (!params.skip_adapterremoval) { - summary['Skip Read Merging'] = params.skip_collapse ? 'Yes' : 'No' - summary['Skip Adapter Trimming'] = params.skip_trim ? 'Yes' : 'No' -} -summary['Running BAM filtering'] = params.run_bam_filtering ? 'Yes' : 'No' -if (params.run_bam_filtering) { - summary['Skip Read Merging'] = params.bam_unmapped_type +//Add header +log.info Headers.nf_core(workflow, params.monochrome_logs) + +//Add Summary Parameters +def summary_params = NfcoreSchema.params_summary_map(workflow, params, json_schema) +log.info NfcoreSchema.params_summary_log(workflow, params, json_schema) + +// Check that conda channels are set-up correctly +if (params.enable_conda) { + Checks.check_conda_channels(log) } -summary['Run Fastq Host Removal'] = params.hostremoval_input_fastq ? 'Yes' : 'No' -if (params.hostremoval_input_fastq){ - summary['Host removal mode'] = params.hostremoval_mode -} -summary['Skipping Preseq?'] = params.skip_preseq ? 'Yes' : 'No' -summary['Skipping Deduplication?'] = params.skip_deduplication ? 'Yes' : 'No' -summary['Skipping DamageProfiler?'] = params.skip_damage_calculation ? 'Yes' : 'No' -summary['Skipping Qualimap?'] = params.skip_qualimap ? 'Yes' : 'No' -summary['Run BAM Trimming?'] = params.run_trim_bam ? 'Yes' : 'No' -summary['Run PMDtools?'] = params.run_pmdtools ? 'Yes' : 'No' -summary['Run Genotyping?'] = params.run_genotyping ? 'Yes' : 'No' -if (params.run_genotyping){ - summary['Genotyping Tool?'] = params.genotyping_tool - summary['Genotyping BAM Input?'] = params.genotyping_source -} -summary['Run MultiVCFAnalyzer'] = params.run_multivcfanalyzer ? 'Yes' : 'No' -summary['Run VCF2Genome'] = params.run_vcf2genome ? 'Yes' : 'No' -summary['Run SexDetErrMine'] = params.run_sexdeterrmine ? 'Yes' : 'No' -summary['Run Nuclear Contamination Estimation'] = params.run_nuclear_contamination ? 'Yes' : 'No' -summary['Run Bedtools Coverage'] = params.run_bedtools_coverage ? 'Yes' : 'No' -summary['Run Metagenomic Binning'] = params.run_metagenomic_screening ? 
'Yes' : 'No' -if (params.run_metagenomic_screening) { - summary['Metagenomic Tool'] = params.metagenomic_tool - summary['Run MaltExtract'] = params.run_maltextract ? 'Yes' : 'No' -} -summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" -summary['Output Dir'] = params.outdir -summary['Working Dir'] = workflow.workDir -summary['Container Engine'] = workflow.containerEngine -if(workflow.containerEngine) summary['Container'] = workflow.container -summary['Current Home'] = workflow.homeDir -summary['Current User'] = workflow.userName -summary['Working Dir'] = workflow.workDir -summary['Output Dir'] = params.outdir -summary['Script Dir'] = workflow.projectDir -summary['Config Profile'] = workflow.profile -summary['User'] = workflow.userName -if (workflow.profile.contains('awsbatch')) { - summary['AWS Region'] = params.awsregion - summary['AWS Queue'] = params.awsqueue - summary['AWS CLI'] = params.awscli -} -if(params.email) summary['E-mail Address'] = params.email -summary['Config Profile'] = workflow.profile -if (params.config_profile_description) summary['Config Description'] = params.config_profile_description -if (params.config_profile_contact) summary['Config Contact'] = params.config_profile_contact -if (params.config_profile_url) summary['Config URL'] = params.config_profile_url -if (params.email || params.email_on_fail) { - summary['E-mail Address'] = params.email - summary['E-mail on failure'] = params.email_on_fail - summary['MultiQC maxsize'] = params.max_multiqc_email_size -} -log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") -log.info "-\033[2m--------------------------------------------------\033[0m-" + +// Check AWS batch settings +Checks.aws_batch(workflow, params) // Check the hostnames against configured profiles -checkHostname() - -Channel.from(summary.collect{ [it.key, it.value] }) - .map { k,v -> "
<dt>$k</dt><dd><samp>${v ?: 'N/A'}</samp></dd>
" } - .reduce { a, b -> return [a, b].join("\n ") } - .map { x -> """ - id: 'nf-core-eager-summary' - description: " - this information is collected when the pipeline is started." - section_name: 'nf-core/eager Workflow Summary' - section_href: 'https://github.com/nf-core/eager' - plot_type: 'html' - data: | -
<dl class=\"dl-horizontal\"> - $x - </dl>
- """.stripIndent() } - .set { ch_workflow_summary } +Checks.hostname(workflow, params, log) log.info "Schaffa, Schaffa, Genome Baua!" @@ -957,7 +651,7 @@ process makeSeqDict { script: """ - picard -Xmx${task.memory.toMega()}M -Xms${task.memory.toMega()}M CreateSequenceDictionary R=$fasta O="${fasta.baseName}.dict" + picard -Xmx${task.memory.toMega()}M CreateSequenceDictionary R=$fasta O="${fasta.baseName}.dict" """ } @@ -1156,63 +850,121 @@ process adapter_removal { script: base = "${r1.baseName}_L${lane}" //This checks whether we skip trimming and defines a variable respectively - def trim_me = params.skip_trim ? '' : "--trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}" - def collapse_me = params.skip_collapse ? '' : '--collapse' - def preserve5p = params.preserve5p ? '--preserve5p' : '' - def mergedonly = params.mergedonly ? "Y" : "N" + def preserve5p = params.preserve5p ? '--preserve5p' : '' // applies to any AR command - doesn't affect output file combination - //PE mode, dependent on trim_me and collapse_me the respective procedure is run or not :-) - if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim ){ + if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && !params.mergedonly && !params.preserve5p ) { """ mkdir -p output - AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe ${trim_me} --gzip --threads ${task.cpus} ${collapse_me} ${preserve5p} + + AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap} + + cat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz - #Combine files - if [ ${preserve5p} = "--preserve5p" ] && [ ${mergedonly} = "N" ]; then - cat *.collapsed.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz - elif [ ${preserve5p} = "--preserve5p" ] && [ ${mergedonly} = "Y" ] ; then - cat *.collapsed.gz > output/${base}.pe.combined.tmp.fq.gz - elif [ ${mergedonly} = "Y" ] ; then - cat *.collapsed.gz *.collapsed.truncated.gz > output/${base}.pe.combined.tmp.fq.gz - else - cat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz - fi + mv *.settings output/ ## Add R_ and L_ for unmerged reads for DeDup compatibility - AdapterRemovalFixPrefix output/${base}.pe.combined.tmp.fq.gz | pigz -p ${task.cpus} > output/${base}.pe.combined.fq.gz - + AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz > output/${base}.pe.combined.fq + + pigz -p ${task.cpus} output/${base}.pe.combined.fq + """ + //PE mode, collapse and trim, outputting all reads, preserving 5p + } else if (seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && !params.mergedonly && params.preserve5p) { + """ + mkdir -p output + + AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse 
${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap} + + cat *.collapsed.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz + mv *.settings output/ + + ## Add R_ and L_ for unmerged reads for DeDup compatibility + AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz > output/${base}.pe.combined.fq + + pigz -p ${task.cpus} output/${base}.pe.combined.fq """ - //PE, don't collapse, but trim reads - } else if ( seqtype == 'PE' && params.skip_collapse && !params.skip_trim ) { + // PE mode, collapse and trim but only output collapsed reads + } else if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && params.mergedonly && !params.preserve5p ) { """ mkdir -p output - AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} ${trim_me} ${collapse_me} ${preserve5p} - mv *.settings ${base}.pe.pair*.truncated.gz output/ + AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap} + + cat *.collapsed.gz *.collapsed.truncated.gz > output/${base}.pe.combined.tmp.fq.gz + + ## Add R_ and L_ for unmerged reads for DeDup compatibility + AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz > output/${base}.pe.combined.fq + + pigz -p ${task.cpus} output/${base}.pe.combined.fq + + mv *.settings output/ """ - //PE, collapse, but don't trim reads - } else if ( seqtype == 'PE' && !params.skip_collapse && params.skip_trim ) { + // PE mode, collapse and trim but only output collapsed reads, preserving 5p + } else if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim && params.mergedonly && params.preserve5p ) { """ mkdir -p output - AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} ${collapse_me} ${trim_me} + AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap} - if [ ${mergedonly} = "Y" ]; then - cat *.collapsed.gz *.collapsed.truncated.gz > output/${base}.pe.combined.tmp.fq.gz - else - cat *.collapsed.gz *.collapsed.truncated.gz *.singleton.truncated.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz - fi + cat *.collapsed.gz > output/${base}.pe.combined.tmp.fq.gz + + ## Add R_ and L_ for unmerged reads for DeDup compatibility + AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz > output/${base}.pe.combined.fq + + pigz -p ${task.cpus} output/${base}.pe.combined.fq + mv *.settings output/ + """ + // PE mode, collapsing but skip trim, (output all reads). Note: seems to still generate `truncated` files for some reason, so merging for safety. 
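The four collapse-and-trim branches above differ mainly in which AdapterRemoval outputs get concatenated. A condensed sketch of that decision table, before the skip-trim variants below (the helper name is illustrative, not pipeline code):

```groovy
// Which AdapterRemoval outputs are merged for downstream use, per flag
// combination, read off the four collapse+trim branches above
// (the skip_trim variants below use slightly different sets).
List filesToMerge(boolean mergedonly, boolean preserve5p) {
    if (mergedonly && preserve5p) return ['*.collapsed.gz']
    if (mergedonly)               return ['*.collapsed.gz', '*.collapsed.truncated.gz']
    if (preserve5p)               return ['*.collapsed.gz', '*.singleton.truncated.gz',
                                          '*.pair1.truncated.gz', '*.pair2.truncated.gz']
    return ['*.collapsed.gz', '*.collapsed.truncated.gz', '*.singleton.truncated.gz',
            '*.pair1.truncated.gz', '*.pair2.truncated.gz']
}
```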
+ // Will still apply AdapterRemoval's default read-length filtering. + } else if ( seqtype == 'PE' && !params.skip_collapse && params.skip_trim && !params.mergedonly ) { + """ + mkdir -p output + AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --adapter1 "" --adapter2 "" + + cat *.collapsed.gz *.pair1.truncated.gz *.pair2.truncated.gz > output/${base}.pe.combined.tmp.fq.gz + + ## Add R_ and L_ for unmerged reads for DeDup compatibility - AdapterRemovalFixPrefix output/${base}.pe.combined.tmp.fq.gz | pigz -p ${task.cpus} > output/${base}.pe.combined.fq.gz + AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz > output/${base}.pe.combined.fq + + pigz -p ${task.cpus} output/${base}.pe.combined.fq + mv *.settings output/ """ - } else if ( seqtype != 'PE' ) { - //SE, collapse not possible, trim reads + // PE mode, collapsing but skipping trimming, and only outputting collapsed reads. + // Will still apply AdapterRemoval's default read-length filtering. + } else if ( seqtype == 'PE' && !params.skip_collapse && params.skip_trim && params.mergedonly ) { + """ + mkdir -p output - AdapterRemoval --file1 ${r1} --basename ${base}.se --gzip --threads ${task.cpus} ${trim_me} ${preserve5p} + AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} --collapse ${preserve5p} --adapter1 "" --adapter2 "" + + cat *.collapsed.gz > output/${base}.pe.combined.tmp.fq.gz + ## Add R_ and L_ for unmerged reads for DeDup compatibility + AdapterRemovalFixPrefix -Xmx${task.memory.toGiga()}g output/${base}.pe.combined.tmp.fq.gz > output/${base}.pe.combined.fq + + pigz -p ${task.cpus} output/${base}.pe.combined.fq + + mv *.settings output/ + """ + // PE mode, skip collapsing but trim (outputs all reads, as merging is not possible) - activates paired-end mapping! 
+ } else if ( seqtype == 'PE' && params.skip_collapse && !params.skip_trim ) { + """ + mkdir -p output + AdapterRemoval --file1 ${r1} --file2 ${r2} --basename ${base}.pe --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap} + + mv ${base}.pe.pair*.truncated.gz *.settings output/ + """ + } else if ( seqtype != 'PE' && !params.skip_trim ) { + //SE, collapse not possible, trim reads only + """ + mkdir -p output + AdapterRemoval --file1 ${r1} --basename ${base}.se --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap} + mv *.settings *.se.truncated.gz output/ + """ + } else if ( seqtype != 'PE' && params.skip_trim ) { + //SE, collapse not possible, trim reads only + """ + mkdir -p output + AdapterRemoval --file1 ${r1} --basename ${base}.se --gzip --threads ${task.cpus} --qualitymax ${params.qualitymax} ${preserve5p} --adapter1 "" --adapter2 "" mv *.settings *.se.truncated.gz output/ """ } @@ -1853,9 +1605,9 @@ process samtools_filter { tag "$libraryid" publishDir "${params.outdir}/samtools/filter", mode: params.publish_dir_mode, saveAs: {filename -> - if (filename.indexOf(".fq.gz") > 0) "unmapped/$filename" - else if (filename.indexOf(".unmapped.bam") > 0) "unmapped/$filename" - else if (filename.indexOf(".filtered.bam")) filename + if (filename.indexOf(".fq.gz") > 0) "$filename" + else if (filename.indexOf(".unmapped.bam") > 0) "$filename" + else if (filename.indexOf(".filtered.bam")) "$filename" else null } @@ -1874,82 +1626,81 @@ process samtools_filter { shell: size = !{params.large_ref} ? 
'-c' : '' - if ( "${params.bam_unmapped_type}" == "keep" ) { + // Unmapped/MAPQ Filtering WITHOUT min-length filtering + if ( "${params.bam_unmapped_type}" == "keep" && params.bam_filter_minreadlength == 0 ) { ''' - ## Unmapped and MAPQ filtering - samtools view -h -b !{bam} -@ !{task.cpus} -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam - - ## Mapped LEN filtering - if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then - mv tmp_mapped.bam !{libraryid}.filtered.bam - else - filter_bam_fragment_length.py -a -l !{params.bam_filter_minreadlength} -o !{libraryid} tmp_mapped.bam - fi + samtools view -h -b !{bam} -@ !{task.cpus} -q !{params.bam_mapping_quality_threshold} -o !{libraryid}.filtered.bam + samtools index !{libraryid}.filtered.bam !{size} + ''' + } else if ( "${params.bam_unmapped_type}" == "discard" && params.bam_filter_minreadlength == 0 ){ + ''' + samtools view -h -b !{bam} -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o !{libraryid}.filtered.bam + samtools index !{libraryid}.filtered.bam !{size} + ''' + } else if ( "${params.bam_unmapped_type}" == "bam" && params.bam_filter_minreadlength == 0 ){ + ''' + samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam + samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o !{libraryid}.filtered.bam + samtools index !{libraryid}.filtered.bam !{size} + ''' + } else if ( "${params.bam_unmapped_type}" == "fastq" && params.bam_filter_minreadlength == 0 ){ + ''' + samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam + samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o !{libraryid}.filtered.bam + samtools index !{libraryid}.filtered.bam !{size} + ## FASTQ + samtools fastq -tn !{libraryid}.unmapped.bam | pigz -p !{task.cpus} > !{libraryid}.unmapped.fastq.gz + rm !{libraryid}.unmapped.bam + ''' + } else if ( "${params.bam_unmapped_type}" == "both" && params.bam_filter_minreadlength == 0 ){ + ''' + samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam + samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o !{libraryid}.filtered.bam + samtools index !{libraryid}.filtered.bam !{size} + + ## FASTQ + samtools fastq -tn !{libraryid}.unmapped.bam | pigz -p !{task.cpus} > !{libraryid}.unmapped.fastq.gz + ''' + // Unmapped/MAPQ Filtering WITH min-length filtering + } else if ( "${params.bam_unmapped_type}" == "keep" && params.bam_filter_minreadlength != 0 ) { + ''' + samtools view -h -b !{bam} -@ !{task.cpus} -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam + filter_bam_fragment_length.py -a -l !{params.bam_filter_minreadlength} -o !{libraryid} tmp_mapped.bam samtools index !{libraryid}.filtered.bam !{size} ''' - } else if("${params.bam_unmapped_type}" == "discard"){ + } else if ( "${params.bam_unmapped_type}" == "discard" && params.bam_filter_minreadlength != 0 ){ ''' - ## Unmapped and MAPQ filtering samtools view -h -b !{bam} -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam - - ## Mapped LEN filtering - if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then - mv tmp_mapped.bam !{libraryid}.filtered.bam - else - filter_bam_fragment_length.py -a -l !{params.bam_filter_minreadlength} -o !{libraryid} tmp_mapped.bam - fi - + filter_bam_fragment_length.py -a -l !{params.bam_filter_minreadlength} -o !{libraryid} 
tmp_mapped.bam samtools index !{libraryid}.filtered.bam !{size} ''' - } else if("${params.bam_unmapped_type}" == "bam"){ + } else if ( "${params.bam_unmapped_type}" == "bam" && params.bam_filter_minreadlength != 0 ){ ''' - ## Unmapped and MAPQ filtering samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam - - ## Mapped LEN filtering - if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then - mv tmp_mapped.bam !{libraryid}.filtered.bam - else - filter_bam_fragment_length.py -a -l !{params.bam_filter_minreadlength} -o !{libraryid} tmp_mapped.bam - fi - + filter_bam_fragment_length.py -a -l !{params.bam_filter_minreadlength} -o !{libraryid} tmp_mapped.bam samtools index !{libraryid}.filtered.bam !{size} ''' - } else if("${params.bam_unmapped_type}" == "fastq"){ + } else if ( "${params.bam_unmapped_type}" == "fastq" && params.bam_filter_minreadlength != 0 ){ ''' - ## Unmapped and MAPQ filtering samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam - - ## Mapped LEN filtering - if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then - mv tmp_mapped.bam !{libraryid}.filtered.bam - else - filter_bam_fragment_length.py -a -l !{params.bam_filter_minreadlength} -o !{libraryid} tmp_mapped.bam - fi - + filter_bam_fragment_length.py -a -l !{params.bam_filter_minreadlength} -o !{libraryid} tmp_mapped.bam samtools index !{libraryid}.filtered.bam !{size} ## FASTQ samtools fastq -tn !{libraryid}.unmapped.bam | pigz -p !{task.cpus} > !{libraryid}.unmapped.fastq.gz rm !{libraryid}.unmapped.bam ''' - } else if("${params.bam_unmapped_type}" == "both"){ + } else if ( "${params.bam_unmapped_type}" == "both" && params.bam_filter_minreadlength != 0 ){ ''' - ## Unmapped and MAPQ filtering samtools view -h !{bam} | samtools view - -@ !{task.cpus} -f4 -o !{libraryid}.unmapped.bam samtools view -h !{bam} | samtools view - -@ !{task.cpus} -F4 -q !{params.bam_mapping_quality_threshold} -o tmp_mapped.bam - - ## Mapped LEN filtering - if [[ !{params.bam_filter_minreadlength} -eq 0 ]]; then - mv tmp_mapped.bam !{libraryid}.filtered.bam - else - filter_bam_fragment_length.py -a -l !{params.bam_filter_minreadlength} -o !{libraryid} tmp_mapped.bam - fi - + filter_bam_fragment_length.py -a -l !{params.bam_filter_minreadlength} -o !{libraryid} tmp_mapped.bam samtools index !{libraryid}.filtered.bam !{size} + + ## FASTQ samtools fastq -tn !{libraryid}.unmapped.bam | pigz -p !{task.cpus} > !{libraryid}.unmapped.fastq.gz ''' } @@ -2057,21 +1808,26 @@ process dedup{ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("${libraryid}_rmdup.bam"), path("*.{bai,csi}") into ch_output_from_dedup, ch_dedup_for_libeval script: - def outname = "${bam.baseName}" def treat_merged = params.dedup_all_merged ? '-m' : '' def size = params.large_ref ? '-c' : '' + if ( bam.baseName != libraryid ) { + // To make sure direct BAMs have a clean name + """ + mv ${bam} ${libraryid}.bam + dedup -Xmx${task.memory.toGiga()}g -i ${libraryid}.bam $treat_merged -o . 
-u + mv *.log dedup.log + samtools sort -@ ${task.cpus} "${libraryid}"_rmdup.bam -o "${libraryid}"_rmdup.bam + samtools index "${libraryid}"_rmdup.bam ${size} + """ + } else { """ - ## To make sure direct BAMs have a clean name - if [[ "${bam}" != "${libraryid}.bam" ]]; then - mv ${bam} ${libraryid}.bam - fi - dedup -Xmx${task.memory.toGiga()}g -i ${libraryid}.bam $treat_merged -o . -u mv *.log dedup.log samtools sort -@ ${task.cpus} "${libraryid}"_rmdup.bam -o "${libraryid}"_rmdup.bam samtools index "${libraryid}"_rmdup.bam ${size} """ + } } process markduplicates{ @@ -2091,17 +1847,22 @@ process markduplicates{ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("${libraryid}_rmdup.bam"), path("*.{bai,csi}") into ch_output_from_markdup, ch_markdup_for_libeval script: - def outname = "${bam.baseName}" def size = params.large_ref ? '-c' : '' - """ - ## To make sure direct BAMs have a clean name - if [[ "${bam}" != "${libraryid}.bam" ]]; then - mv ${bam} ${libraryid}.bam - fi - picard -Xmx${task.memory.toMega()}M -Xms${task.memory.toMega()}M MarkDuplicates INPUT=${libraryid}.bam OUTPUT=${libraryid}_rmdup.bam REMOVE_DUPLICATES=TRUE AS=TRUE METRICS_FILE="${libraryid}_rmdup.metrics" VALIDATION_STRINGENCY=SILENT + if ( bam.baseName != libraryid ) { + // To make sure direct BAMs have a clean name + """ + mv ${bam} ${libraryid}.bam + picard -Xmx${task.memory.toMega()}M MarkDuplicates INPUT=${libraryid}.bam OUTPUT=${libraryid}_rmdup.bam REMOVE_DUPLICATES=TRUE AS=TRUE METRICS_FILE="${libraryid}_rmdup.metrics" VALIDATION_STRINGENCY=SILENT samtools index ${libraryid}_rmdup.bam ${size} """ + } else { + """ + picard -Xmx${task.memory.toMega()}M MarkDuplicates INPUT=${libraryid}.bam OUTPUT=${libraryid}_rmdup.bam REMOVE_DUPLICATES=TRUE AS=TRUE METRICS_FILE="${libraryid}_rmdup.metrics" VALIDATION_STRINGENCY=SILENT + samtools index ${libraryid}_rmdup.bam ${size} + """ + } + } // This is for post-deduplication per-library evaluation steps _without_ any @@ -2465,7 +2226,7 @@ process additional_library_merge { ch_trimmed_formerge.skip_merging .mix(ch_output_from_trimmerge) - .into{ ch_output_from_bamutils; ch_addlibmerge_for_qualimap; ch_for_sexdeterrmine } + .into{ ch_output_from_bamutils; ch_addlibmerge_for_qualimap; ch_for_sexdeterrmine_prep } // General mapping quality statistics for whole reference sequence - e.g. X and % coverage @@ -2762,10 +2523,11 @@ process genotyping_angsd { } def angsd_fasta = !params.angsd_createfasta ? '' : params.angsd_fastamethod == 'random' ? '-doFasta 1 -doCounts 1' : '-doFasta 2 -doCounts 1' + def angsd_majorminor = params.angsd_glformat != "beagle" ?
'' : '-doMajorMinor 1' """ echo ${bam} > bam.filelist mkdir angsd - angsd -bam bam.filelist -nThreads ${task.cpus} -GL ${angsd_glmodel} -doGlF ${angsd_glformat} ${angsd_fasta} -out ${samplename}.angsd + angsd -bam bam.filelist -nThreads ${task.cpus} -GL ${angsd_glmodel} -doGlF ${angsd_glformat} ${angsd_majorminor} ${angsd_fasta} -out ${samplename}.angsd """ } @@ -2871,13 +2633,33 @@ process multivcfanalyzer { // Human biological sex estimation +// rename to prevent file name conflicts between single- and double-stranded libraries of the same sample +process sexdeterrmine_prep { + label 'sc_small' + + input: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(bam), path(bai) from ch_for_sexdeterrmine_prep + + output: + file "*_{single,double}strand.bam" into ch_prepped_for_sexdeterrmine + + when: + params.run_sexdeterrmine + + script: + """ + mv ${bam} ${bam.baseName}_${strandedness}strand.bam + """ + +} + // As we collect all files for a single sex_deterrmine run, we DO NOT use the normal input/output tuple -process sex_deterrmine { +process sexdeterrmine { label 'sc_small' publishDir "${params.outdir}/sex_determination", mode: params.publish_dir_mode input: - path bam from ch_for_sexdeterrmine.map { it[7] }.collect() + path bam from ch_prepped_for_sexdeterrmine.collect() path(bed) from ch_bed_for_sexdeterrmine output: @@ -2890,11 +2672,7 @@ process sex_deterrmine { script: def filter = bed.getName() != 'nf-core_eager_dummy.txt' ? "-b $bed" : '' """ - - for i in *.bam; do - echo \$i >> bamlist.txt - done - + ls *.bam >> bamlist.txt samtools depth -aa -q30 -Q30 $filter -f bamlist.txt | sexdeterrmine -f bamlist.txt > SexDet.txt """ } @@ -2932,7 +2710,7 @@ process print_nuclear_contamination{ params.run_nuclear_contamination input: - val 'Contam' from ch_from_nuclear_contamination.map { it[7] }.collect() + path Contam from ch_from_nuclear_contamination.map { it[7] }.collect() output: file 'nuclear_contamination.txt' @@ -3108,14 +2886,16 @@ if (params.run_metagenomic_screening && params.database.endsWith(".tar.gz") && p path(dbname) into ch_krakendb script: - dbname = params.database.tokenize("/")[-1].tokenize(".")[0] + dbname = ckdb.toString() - '.tar.gz' """ tar xvzf $ckdb + mkdir -p $dbname + mv *.k2d $dbname || echo "nothing to do" """ } } else if (!
params.database.endsWith(".tar.gz") && params.run_metagenomic_screening && params.metagenomic_tool == 'kraken') { - ch_krakendb = path(params.database) + ch_krakendb = Channel.fromPath(params.database).first() } else { ch_krakendb = Channel.empty() } @@ -3134,15 +2914,17 @@ process kraken { output: file "*.kraken.out" into ch_kraken_out - tuple prefix, path("*.kreport") into ch_kraken_report, ch_kraken_for_multiqc + tuple prefix, path("*.kraken2_report") into ch_kraken_report, ch_kraken_for_multiqc script: prefix = fastq.toString().tokenize('.')[0] out = prefix+".kraken.out" - kreport = prefix+".kreport" + kreport = prefix+".kraken2_report" + kreport_old = prefix+".kreport" """ - kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report $kreport $fastq + kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report-minimizer-data --report $kreport $fastq + cut -f1-3,6-8 $kreport > $kreport_old """ } @@ -3154,12 +2936,13 @@ process kraken_parse { tuple val(name), path(kraken_r) from ch_kraken_report output: - tuple val(name), path('*.kraken_parsed.csv') into ch_kraken_parsed + path('*_kraken_parsed.csv') into ch_kraken_parsed script: - out = name+".kraken_parsed.csv" + read_out = name+".read_kraken_parsed.csv" + kmer_out = name+".kmer_kraken_parsed.csv" """ - kraken_parse.py -c ${params.metagenomic_min_support_reads} -o $out $kraken_r + kraken_parse.py -c ${params.metagenomic_min_support_reads} -or $read_out -ok $kmer_out $kraken_r """ } @@ -3167,15 +2950,16 @@ process kraken_merge { publishDir "${params.outdir}/metagenomic_classification/kraken", mode: params.publish_dir_mode input: - file csv_count from ch_kraken_parsed.map{ it[1] }.collect() + file csv_count from ch_kraken_parsed.collect() output: - path('kraken_count_table.csv') + path('*.csv') script: - out = "kraken_count_table.csv" + read_out = "kraken_read_count.csv" + kmer_out = "kraken_kmer_duplication.csv" """ - merge_kraken_res.py -o $out + merge_kraken_res.py -or $read_out -ok $kmer_out """ } @@ -3259,6 +3043,8 @@ process get_software_versions { } // MultiQC file generation for pipeline report +def workflow_summary = NfcoreSchema.params_summary_multiqc(workflow, summary_params) +ch_workflow_summary = Channel.value(workflow_summary) process multiqc { label 'sc_medium' @@ -3292,7 +3078,6 @@ process multiqc { file ('hops/*') from ch_hops_for_multiqc.collect().ifEmpty([]) file ('nuclear_contamination/*') from ch_nuclear_contamination_for_multiqc.collect().ifEmpty([]) file ('genotyping/*') from ch_eigenstrat_snp_cov_for_multiqc.collect().ifEmpty([]) - file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") output: @@ -3300,8 +3085,13 @@ process multiqc { file "*_data" script: - def rtitle = custom_runName ? "--title \"$custom_runName\"" : '' - def rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' + rtitle = '' + rfilename = '' + if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { + rtitle = "--title \"${workflow.runName}\"" + rfilename = "--filename " + workflow.runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" + } + def custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' """ multiqc -f $rtitle $rfilename $multiqc_config $custom_config_file . 
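The `rtitle`/`rfilename` logic above replaces the removed `custom_runName`/`--name` handling: rather than taking a parameter, the pipeline now inspects `workflow.runName` directly, relying on the fact that Nextflow's auto-generated run names are two lowercase words joined by an underscore (e.g. `agitated_einstein`), whereas a name passed with `-name` normally is not. A minimal standalone sketch of that check, for illustration only (the `multiqcRunNameArgs` helper is hypothetical and not part of the pipeline):

```groovy
// Only a user-supplied run name (i.e. one that does not look auto-generated)
// is forwarded to MultiQC as a custom report title/filename.
def multiqcRunNameArgs(String runName) {
    def rtitle    = ''
    def rfilename = ''
    if (!(runName ==~ /[a-z]+_[a-z]+/)) {
        // Sanitise the name for use as a filename, as in the process above
        rtitle    = "--title \"${runName}\""
        rfilename = '--filename ' + runName.replaceAll('\\W', '_').replaceAll('_+', '_') + '_multiqc_report'
    }
    [rtitle, rfilename]
}

assert multiqcRunNameArgs('agitated_einstein') == ['', '']   // auto-generated name -> MultiQC defaults
assert multiqcRunNameArgs('My eager run')[1] == '--filename My_eager_run_multiqc_report'
```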
@@ -3311,171 +3101,21 @@ process multiqc { // Send completion emails if requested, so user knows data is ready workflow.onComplete { + Completion.email(workflow, params, summary_params, projectDir, log, multiqc_report) + Completion.summary(workflow, params, log, fail_percent_mapped, pass_percent_mapped) +} - // Set up the e-mail variables - def subject = "[nf-core/eager] Successful: $workflow.runName" - if (!workflow.success) { - subject = "[nf-core/eager] FAILED: $workflow.runName" - } - def email_fields = [:] - email_fields['version'] = workflow.manifest.version - email_fields['runName'] = custom_runName ?: workflow.runName - email_fields['success'] = workflow.success - email_fields['dateComplete'] = workflow.complete - email_fields['duration'] = workflow.duration - email_fields['exitStatus'] = workflow.exitStatus - email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') - email_fields['errorReport'] = (workflow.errorReport ?: 'None') - email_fields['commandLine'] = workflow.commandLine - email_fields['projectDir'] = workflow.projectDir - email_fields['summary'] = summary - email_fields['summary']['Date Started'] = workflow.start - email_fields['summary']['Date Completed'] = workflow.complete - email_fields['summary']['Pipeline script file path'] = workflow.scriptFile - email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId - if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository - if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId - if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision - email_fields['summary']['Nextflow Version'] = workflow.nextflow.version - email_fields['summary']['Nextflow Build'] = workflow.nextflow.build - email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - - // On success try attach the multiqc report - def mqc_report = null - try { - if (workflow.success) { - mqc_report = ch_multiqc_report.getVal() - if (mqc_report instanceof ArrayList) { - log.warn "[nf-core/eager] Found multiple reports from process 'multiqc', will use only one" - mqc_report = mqc_report[0] - } - } - } catch (all) { - log.warn "[nf-core/eager] Could not attach MultiQC report to summary email" - } - - // Check if we are only sending emails on failure - email_address = params.email - if (!params.email && params.email_on_fail && !workflow.success) { - email_address = params.email_on_fail - } - - // Render the TXT template - def engine = new groovy.text.GStringTemplateEngine() - def tf = new File("$projectDir/assets/email_template.txt") - def txt_template = engine.createTemplate(tf).make(email_fields) - def email_txt = txt_template.toString() - - // Render the HTML template - def hf = new File("$projectDir/assets/email_template.html") - def html_template = engine.createTemplate(hf).make(email_fields) - def email_html = html_template.toString() - - // Render the sendmail template - def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] - def sf = new File("$projectDir/assets/sendmail_template.txt") - def sendmail_template = engine.createTemplate(sf).make(smail_fields) - def sendmail_html = sendmail_template.toString() - - // Send the HTML e-mail - if (email_address) { - try { - if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, 
not HTML') } - // Try to send HTML e-mail using sendmail - [ 'sendmail', '-t' ].execute() << sendmail_html - log.info "[nf-core/eager] Sent summary e-mail to $email_address (sendmail)" - } catch (all) { - // Catch failures and try with plaintext - def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] - if (mqc_report == NULL) { - log.warn "[nf-core/eager] Could not attach MultiQC report to summary email" - } else if ( mqc_report.size() <= params.max_multiqc_email_size.toBytes() ) { - mail_cmd += [ '-A', mqc_report ] - } - mail_cmd.execute() << email_html - log.info "[nf-core/eager] Sent summary e-mail to $email_address (mail)" - } - } - - // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_hf = new File(output_d, "pipeline_report.html") - output_hf.withWriter { w -> w << email_html } - def output_tf = new File(output_d, "pipeline_report.txt") - output_tf.withWriter { w -> w << email_txt } - - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; - c_red = params.monochrome_logs ? '' : "\033[0;31m"; - c_reset = params.monochrome_logs ? '' : "\033[0m"; - - if (workflow.stats.ignoredCount > 0 && workflow.success) { - log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-" - log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-" - log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-" - } - - if (workflow.success) { - log.info "-${c_purple}[nf-core/eager]${c_green} Pipeline completed successfully${c_reset}-" - } else { - checkHostname() - log.info "-${c_purple}[nf-core/eager]${c_red} Pipeline completed with errors${c_reset}-" +workflow.onError { + // Print unexpected parameters + for (p in unexpectedParams) { + log.warn "Unexpected parameter: ${p}" } - } ///////////////////////////////////// /* -- AUXILIARY FUNCTIONS -- */ ///////////////////////////////////// -def nfcoreHeader() { - // Log colors ANSI codes - c_black = params.monochrome_logs ? '' : "\033[0;30m"; - c_blue = params.monochrome_logs ? '' : "\033[0;34m"; - c_cyan = params.monochrome_logs ? '' : "\033[0;36m"; - c_dim = params.monochrome_logs ? '' : "\033[2m"; - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; - c_reset = params.monochrome_logs ? '' : "\033[0m"; - c_white = params.monochrome_logs ? '' : "\033[0;37m"; - c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; - - return """ -${c_dim}--------------------------------------------------${c_reset}- - ${c_green},--.${c_black}/${c_green},-.${c_reset} - ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} - ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} - ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} - ${c_green}`._,._,\'${c_reset} - ${c_purple} nf-core/eager v${workflow.manifest.version}${c_reset} - -${c_dim}--------------------------------------------------${c_reset}- - """.stripIndent() -} - - -def checkHostname() { - def c_reset = params.monochrome_logs ? '' : "\033[0m" - def c_white = params.monochrome_logs ? '' : "\033[0;37m" - def c_red = params.monochrome_logs ? '' : "\033[1;91m" - def c_yellow_bold = params.monochrome_logs ?
'' : "\033[1;93m" - if (params.hostnames) { - def hostname = "hostname".execute().text.trim() - params.hostnames.each { prof, hnames -> - hnames.each { hname -> - if (hostname.contains(hname) && !workflow.profile.contains(prof)) { - log.error "====================================================\n" + - " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + - " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + - " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + - "============================================================" - } - } - } - } -} - // Channelling the TSV file containing FASTQ or BAM def extract_data(tsvFile) { Channel.fromPath(tsvFile) @@ -3488,6 +3128,18 @@ def extract_data(tsvFile) { checkNumberOfItem(row, 11) + if ( row.Sample_Name.isEmpty() ) exit 1, "[nf-core/eager] error: the Sample_Name column is empty. Ensure all cells are filled or contain 'NA' for optional fields. Check row:\n ${row}" + if ( row.Library_ID.isEmpty() ) exit 1, "[nf-core/eager] error: the Library_ID column is empty. Ensure all cells are filled or contain 'NA' for optional fields. Check row:\n ${row}" + if ( row.Lane.isEmpty() ) exit 1, "[nf-core/eager] error: the Lane column is empty. Ensure all cells are filled or contain 'NA' for optional fields. Check row:\n ${row}" + if ( row.Colour_Chemistry.isEmpty() ) exit 1, "[nf-core/eager] error: the Colour_Chemistry column is empty. Ensure all cells are filled or contain 'NA' for optional fields. Check row:\n ${row}" + if ( row.SeqType.isEmpty() ) exit 1, "[nf-core/eager] error: the SeqType column is empty. Ensure all cells are filled or contain 'NA' for optional fields. Check row:\n ${row}" + if ( row.Organism.isEmpty() ) exit 1, "[nf-core/eager] error: the Organism column is empty. Ensure all cells are filled or contain 'NA' for optional fields. Check row:\n ${row}" + if ( row.Strandedness.isEmpty() ) exit 1, "[nf-core/eager] error: the Strandedness column is empty. Ensure all cells are filled or contain 'NA' for optional fields. Check row:\n ${row}" + if ( row.UDG_Treatment.isEmpty() ) exit 1, "[nf-core/eager] error: the UDG_Treatment column is empty. Ensure all cells are filled or contain 'NA' for optional fields. Check row:\n ${row}" + if ( row.R1.isEmpty() ) exit 1, "[nf-core/eager] error: the R1 column is empty. Ensure all cells are filled or contain 'NA' for optional fields. Check row:\n ${row}" + if ( row.R2.isEmpty() ) exit 1, "[nf-core/eager] error: the R2 column is empty. Ensure all cells are filled or contain 'NA' for optional fields. Check row:\n ${row}" + if ( row.BAM.isEmpty() ) exit 1, "[nf-core/eager] error: the BAM column is empty. Ensure all cells are filled or contain 'NA' for optional fields. Check row:\n ${row}" + def samplename = row.Sample_Name def libraryid = row.Library_ID def lane = row.Lane @@ -3501,10 +3153,10 @@ def extract_data(tsvFile) { def bam = row.BAM.matches('NA') ? 'NA' : return_file(row.BAM) // check no empty metadata fields - if (samplename == '' || libraryid == '' || lane == '' || colour == '' || seqtype == '' || seqtype == '' || udg == '' || r1 == '' || r2 == '') exit 1, "[nf-core/eager] error: a field does not contain any information. Ensure all cells are filled or contain 'NA' for optional fields. 
Check row:\n ${row}" + if (samplename == '' || libraryid == '' || lane == '' || colour == '' || seqtype == '' || organism == '' || strandedness == '' || udg == '' || r1 == '' || r2 == '' || bam == '') exit 1, "[nf-core/eager] error: a field/column does not contain any information. Ensure all cells are filled or contain 'NA' for optional fields. Check row:\n ${row}" // Check no 'empty' rows - if (r1.matches('NA') && r2.matches('NA') && bam.matches('NA') && bai.matches('NA')) exit 1, "[nf-core/eager] error: A row in your TSV appears to have all files defined as NA. See '--help' flag and documentation under 'running the pipeline' for more information. Check row for: ${samplename}" + if (r1.matches('NA') && r2.matches('NA') && bam.matches('NA')) exit 1, "[nf-core/eager] error: A row in your TSV appears to have all files defined as NA. See '--help' flag and documentation under 'running the pipeline' for more information. Check row for: ${samplename}" // Ensure BAMs aren't submitted with PE if (!bam.matches('NA') && seqtype.matches('PE')) exit 1, "[nf-core/eager] error: BAM input rows in TSV cannot be set as PE, only SE. See '--help' flag and documentation under 'running the pipeline' for more information. Check row for: ${samplename}" @@ -3513,7 +3165,7 @@ def extract_data(tsvFile) { if (!udg.matches('none') && !udg.matches('half') && !udg.matches('full')) exit 1, "[nf-core/eager] error: UDG treatment can only be 'none', 'half' or 'full'. See '--help' flag and documentation under 'running the pipeline' for more information. You have '${udg}'" // Check valid colour chemistry - if (!colour == 2 && !colour == 4) exit 1, "[nf-core/eager] error: Colour chemistry in TSV can either be 2 (e.g. NextSeq/NovaSeq) or 4 (e.g. HiSeq/MiSeq)" + if (!colour.matches('2') && !colour.matches('4')) exit 1, "[nf-core/eager] error: Colour chemistry in TSV can either be 2 (e.g. NextSeq/NovaSeq) or 4 (e.g. HiSeq/MiSeq)" // Ensure that we do not accept incompatible chemistry setup if (!seqtype.matches('PE') && !seqtype.matches('SE')) exit 1, "[nf-core/eager] error: SeqType for one or more rows in TSV is neither SE nor PE! see '--help' flag and documentation under 'running the pipeline' for more information. 
You have: '${seqtype}'" diff --git a/nextflow.config b/nextflow.config index e4c0ddfd6..5a87732e0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,6 +12,7 @@ params { single_end = false outdir = './results' publish_dir_mode = 'copy' + config_profile_name = '' // aws awsqueue = '' @@ -19,6 +20,10 @@ params { awscli = '' //Pipeline options + enable_conda = false + validate_params = true + schema_ignore_params = 'genomes' + show_hidden_params = false //Input reads input = null @@ -40,8 +45,6 @@ params { seq_dict = '' large_ref = false save_reference = false - saveTrimmed = true - saveAlignedIntermediates = false //Skipping parts of the pipeline for impatient users skip_fastqc = false @@ -65,6 +68,7 @@ params { skip_trim = false preserve5p = false mergedonly = false + qualitymax = 41 //Mapping algorithm mapper = 'bwaaln' @@ -224,7 +228,6 @@ params { maltextract_topalignment = false // Boilerplate options - name = false multiqc_config = false email = false email_on_fail = false @@ -234,7 +237,7 @@ params { help = false igenomes_base = 's3://ngi-igenomes/igenomes/' tracedir = "${params.outdir}/pipeline_info" - igenomes_ignore = false + igenomes_ignore = true custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" hostnames = false @@ -251,7 +254,7 @@ params { // Container slug. Stable releases should specify release tag! // Developmental code should specify :dev -process.container = 'nfcore/eager:2.3.1' +process.container = 'nfcore/eager:2.3.2' // Load base.config by default for all pipelines includeConfig 'conf/base.config' @@ -271,7 +274,10 @@ try { } profiles { - conda { process.conda = "$projectDir/environment.yml" } + conda { + process.conda = "$projectDir/environment.yml" + params.enable_conda = true + } debug { process.beforeScript = 'echo $HOSTNAME' } docker { docker.enabled = true @@ -344,8 +350,8 @@ manifest { homePage = 'https://github.com/nf-core/eager' description = 'A fully reproducible and state-of-the-art ancient DNA analysis pipeline' mainScript = 'main.nf' - nextflowVersion = '!>=20.04.0' - version = '2.3.1' + nextflowVersion = '!>=20.07.1' + version = '2.3.2' } // Function to ensure that resource requirements don't go beyond diff --git a/nextflow_schema.json b/nextflow_schema.json index 3738a1aea..292a5fdd7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -95,9 +95,9 @@ }, "genome": { "type": "string", - "description": "Name of iGenomes reference (required if not FASTA reference).", + "description": "Name of iGenomes reference (required if not FASTA reference). Requires argument `--igenomes_ignore false`, as iGenomes is ignored by default in nf-core/eager", "fa_icon": "fas fa-book", - "help_text": "Alternatively to `--fasta`, the pipeline config files come bundled with paths to the Illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource.\n\nThere are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag.\n\nYou can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are:\n\n- Human\n - `--genome GRCh37`\n - `--genome GRCh38`\n- Mouse *\n - `--genome GRCm38`\n- _Drosophila_ *\n - `--genome BDGP6`\n- _S. 
cerevisiae_ *\n - `--genome 'R64-1-1'`\n\n> \\* Not bundled with nf-core eager by default.\n\nNote that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file.\n\nThe syntax for this reference configuration is as follows:\n\n```nextflow\nparams {\n genomes {\n 'GRCh37' {\n fasta = ''\n }\n // Any number of additional genomes, key is used with --genome\n }\n}\n```" + "help_text": "Alternatively to `--fasta`, the pipeline config files come bundled with paths to the Illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource.\n\nThere are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag.\n\nYou can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are:\n\n- Human\n - `--genome GRCh37`\n - `--genome GRCh38`\n- Mouse *\n - `--genome GRCm38`\n- _Drosophila_ *\n - `--genome BDGP6`\n- _S. cerevisiae_ *\n - `--genome 'R64-1-1'`\n\n> \\* Not bundled with nf-core eager by default.\n\nNote that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file.\n\nThe syntax for this reference configuration is as follows:\n\n```nextflow\nparams {\n genomes {\n 'GRCh37' {\n fasta = ''\n }\n // Any number of additional genomes, key is used with --genome\n }\n}\n```\n\n**NB** Requires argument `--igenomes_ignore false`, as iGenomes is ignored by default in nf-core/eager." }, "igenomes_base": { "type": "string", @@ -181,17 +181,6 @@ "copyNoFollow", "move" ] - }, - "saveTrimmed": { - "type": "boolean", - "default": true, - "description": "Turn this on if you want to keep trimmed reads.", - "hidden": true - }, - "saveAlignedIntermediates": { - "type": "boolean", - "description": "Turn this on if you want to keep intermediate alignment files (SAM, BAM, non-dedupped BAM)", - "hidden": true } }, "fa_icon": "fas fa-cloud-download-alt" @@ -206,13 +195,6 @@ "hidden": true, "fa_icon": "fas fa-question-circle" }, - "name": { - "type": "string", - "description": "Workflow name of run, for future reference.", - "fa_icon": "fas fa-fingerprint", - "hidden": true, - "help_text": "A custom name for the pipeline run. Unlike the core nextflow `-name` option with one hyphen this parameter can be reused multiple times, for example if using `-resume`. Passed through to steps such as MultiQC and used for things like report filenames and titles." - }, "email": { "type": "string", "description": "Email address for completion summary.", @@ -262,6 +244,38 @@ "default": "${params.outdir}/pipeline_info", "fa_icon": "fas fa-cogs", "hidden": true + }, + "show_hidden_params": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "hidden": true, + "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters."
+ }, + "enable_conda": { + "type": "boolean", + "hidden": true, + "description": "Parameter used to check that conda channels are set correctly." + }, + "validate_params": { + "type": "boolean", + "default": true, + "description": "Whether to validate parameters against the schema at runtime.", + "fa_icon": "fab fa-angellist", + "hidden": true + }, + "schema_ignore_params": { + "type": "string", + "fa_icon": "fas fa-not-equal", + "description": "String to specify ignored parameters for parameter validation", + "hidden": true, + "default": "genomes" + }, + "config_profile_name": { + "type": "string", + "description": "String to describe the config profile that is run.", + "fa_icon": "fas fa-id-badge", + "hidden": true } }, "fa_icon": "fas fa-file-import", @@ -472,9 +486,9 @@ }, "skip_collapse": { "type": "boolean", - "description": "Skip of merging forward and reverse reads together. Only applicable for paired-end libraries.", + "description": "Skips merging of forward and reverse reads, and turns on paired-end alignment for downstream mapping. Only applicable for paired-end libraries.", "fa_icon": "fas fa-fast-forward", - "help_text": "Turns off the paired-end read merging.\n\nFor example\n\n```bash\n--skip_collapse --input '*_{R1,R2}_*.fastq'\n```\n\nIt is important to use the paired-end wildcard globbing as `--skip_collapse` can only be used on paired-end data!\n\n:warning: If you run this and also with `--clip_readlength` set to something (as is by default), you may end up removing single reads from either the pair1 or pair2 file. These will be NOT be mapped when aligning with either `bwa` or `bowtie`, as both can only accept one (forward) or two (forward and reverse) FASTQs as input.\n\n> Modifies AdapterRemoval parameter: `--collapse`" + "help_text": "Turns off the paired-end read merging.\n\nFor example\n\n```bash\n--skip_collapse --input '*_{R1,R2}_*.fastq'\n```\n\nIt is important to use the paired-end wildcard globbing as `--skip_collapse` can only be used on paired-end data!\n\n:warning: If you run this and also with `--clip_readlength` set to something (as is by default), you may end up removing single reads from either the pair1 or pair2 file. These will NOT be mapped when aligning with either `bwa` or `bowtie`, as both can only accept one (forward) or two (forward and reverse) FASTQs as input.\n\nAlso note that supplying this flag will also cause downstream mapping steps to run in paired-end mode. This may be more suitable for modern data, or when you want to utilise mate-pair spatial information.\n\n> Modifies AdapterRemoval parameter: `--collapse`" }, "skip_trim": { "type": "boolean", @@ -493,6 +507,13 @@ "description": "Only use merged reads downstream (un-merged reads and singletons are discarded).", "fa_icon": "fas fa-handshake", "help_text": "Specify that only merged reads are sent downstream for analysis.\n\nSingletons (i.e. reads missing a pair), or un-merged reads (where there wasn't sufficient overlap) are discarded.\n\nYou may want to use this if you want to ensure only the best quality reads for your analysis, but with the penalty of potentially losing still valid data (even if some reads have slightly lower quality). It is highly recommended when using `--dedupper 'dedup'` (see below)." + }, + "qualitymax": { + "type": "integer", + "description": "Specify the maximum Phred score used in input FASTQ files", + "help_text": "Specify maximum Phred score of the quality field of FASTQ files.
The quality-score range can vary depending on the machine and version (e.g. see the diagram [here](https://en.wikipedia.org/wiki/FASTQ_format#Encoding)), and this allows you to increase from the default AdapterRemoval value of `41`.\n\n> Modifies AdapterRemoval parameters: `--qualitymax`", + "default": 41, + "fa_icon": "fas fa-arrow-up" + } }, "fa_icon": "fas fa-cut", @@ -586,8 +607,8 @@ }, "bt2n": { "type": "integer", - "description": "Specify the -N parameter for bowtie2 (mismatches in seed). This will override defaults from alignmode/sensitivity.", "default": 0, + "description": "Specify the -N parameter for bowtie2 (mismatches in seed). This will override defaults from alignmode/sensitivity.", "fa_icon": "fas fa-sort-numeric-down", "help_text": "The number of mismatches allowed in the seed during the seed-and-extend procedure of Bowtie2. This will override any values set with `--bt2_sensitivity`. Can either be 0 or 1. Default: 0 (i.e. use `--bt2_sensitivity` defaults).\n\n> Modifies Bowtie2 parameters: `-N`", "enum": [ @@ -640,7 +661,8 @@ "help_text": "Read removal mode. Remove mapped reads completely (`'remove'`) or just replace the mapped read sequences with N (`'replace'`)\n\n> Modifies extract_map_reads.py parameter: `-m`", "enum": [ "strip", - "replace" + "replace", + "remove" ] } }, @@ -661,8 +683,8 @@ }, "bam_mapping_quality_threshold": { "type": "integer", - "description": "Minimum mapping quality for reads filter.", "default": 0, + "description": "Minimum mapping quality for reads filter.", "fa_icon": "fas fa-greater-than-equal", "help_text": "Specify a mapping quality threshold for mapped reads to be kept for downstream analysis. By default keeps all reads and is therefore set to `0` (basically doesn't filter anything).\n\n> Modifies samtools view parameter: `-q`" }, @@ -1007,7 +1029,7 @@ ] }, "gatk_ug_keep_realign_bam": { - "type": "string", + "type": "boolean", "description": "Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper.", "fa_icon": "fas fa-align-left", "help_text": "If provided when running GATK's UnifiedGenotyper, this will put into the output folder the BAMs that have realigned reads (with GATK's (v3) IndelRealigner) around possible variants for improved genotyping.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." @@ -1279,7 +1301,7 @@ "properties": { "run_sexdeterrmine": { "type": "boolean", - "description": "Turn on sex determination for human reference genomes.", + "description": "Turn on sex determination for human reference genomes. This will run on single- and double-stranded variants of a library separately.", "fa_icon": "fas fa-transgender-alt", "help_text": "Specify to run the optional process of sex determination.\n" }, @@ -1346,7 +1368,6 @@ "type": "string", "description": "Specify which classifier to use. Options: 'malt', 'kraken'.", "fa_icon": "fas fa-tools", - "default": "undefined", "help_text": "Specify which taxonomic classifier to use. There are two options available:\n\n- `kraken` for [Kraken2](https://ccb.jhu.edu/software/kraken2)\n- `malt` for [MALT](https://software-ab.informatik.uni-tuebingen.de/download/malt/welcome.html)\n\n:warning: **Important** It is very important to run `nextflow clean -f` on your\nNextflow run directory once completed. RMA6 files are VERY large and are\n_copied_ from a `work/` directory into the results folder. You should clean the\nwork directory with the command to ensure non-redundancy and avoid large HDD\nfootprints!"
}, "database": { @@ -1620,4 +1641,4 @@ "$ref": "#/definitions/metagenomic_authentication" } ] -} \ No newline at end of file +}
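One of the quieter but important fixes above is the colour chemistry check in `extract_data`: the old guard `if (!colour == 2 && !colour == 4)` could never fire, because Groovy's unary `!` binds before `==`, so the (truthy, non-empty) string is coerced to `false`, and `false == 2` is always `false`. The new `.matches()` form compares the string value itself. A minimal standalone illustration of the pitfall (the `colour` value here is made up):

```groovy
// An invalid colour chemistry value as it would arrive from a TSV cell
def colour = '3'

// Old check: `!colour` evaluates the string's truthiness ('3' is non-empty,
// hence truthy), so `!colour == 2` is `false == 2`, i.e. always false --
// the bad row sails through validation.
assert (!colour == 2 && !colour == 4) == false

// New check: compares the actual string content, so '3' is caught.
assert !colour.matches('2') && !colour.matches('4')
```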