diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ff6c69f5..2b172bf11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Added` +- [#687](https://github.com/nf-core/eager/pull/687) - Adds Kraken2 unique kmer counting report - [#676](https://github.com/nf-core/eager/issues/676) - Refactor help message / summary message formatting to automatic versions using nf-core library - [#682](https://github.com/nf-core/eager/issues/682) - Add AdapterRemoval `--qualitymax` flag to allow FASTQ Phred score range max more than 41 diff --git a/bin/kraken_parse.py b/bin/kraken_parse.py index 7fb348c03..3cdd4e213 100755 --- a/bin/kraken_parse.py +++ b/bin/kraken_parse.py @@ -19,18 +19,24 @@ def _get_args(): default=50, help="Minimum number of hits on clade to report it. Default = 50") parser.add_argument( - '-o', - dest="output", + '-or', + dest="readout", default=None, - help="Output file. Default = .kraken_parsed.csv") + help="Read count output file. Default = .read_kraken_parsed.csv") + parser.add_argument( + '-ok', + dest="kmerout", + default=None, + help="Kmer Output file. 
Default = .kmer_kraken_parsed.csv") args = parser.parse_args() infile = args.krakenReport countlim = int(args.count) - outfile = args.output + readout = args.readout + kmerout = args.kmerout - return(infile, countlim, outfile) + return(infile, countlim, readout, kmerout) def _get_basename(file_name): @@ -51,14 +57,23 @@ def parse_kraken(infile, countlim): ''' with open(infile, 'r') as f: - resdict = {} + read_dict = {} + kmer_dict = {} csvreader = csv.reader(f, delimiter='\t') for line in csvreader: reads = int(line[1]) if reads >= countlim: - taxid = line[4] - resdict[taxid] = reads - return(resdict) + taxid = line[6] + kmer = line[3] + unique_kmer = line[4] + try: + kmer_duplicity = float(kmer)/float(unique_kmer) + except ZeroDivisionError: + kmer_duplicity = 0 + read_dict[taxid] = reads + kmer_dict[taxid] = kmer_duplicity + + return(read_dict, kmer_dict) def write_output(resdict, infile, outfile): @@ -70,10 +85,17 @@ def write_output(resdict, infile, outfile): if __name__ == '__main__': - INFILE, COUNTLIM, outfile = _get_args() + INFILE, COUNTLIM, readout, kmerout = _get_args() - if not outfile: - outfile = _get_basename(INFILE)+".kraken_parsed.csv" + if not readout: + read_outfile = _get_basename(INFILE)+".read_kraken_parsed.csv" + else: + read_outfile = readout + if not kmerout: + kmer_outfile = _get_basename(INFILE)+".kmer_kraken_parsed.csv" + else: + kmer_outfile = kmerout - tmp_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM) - write_output(resdict=tmp_dict, infile=INFILE, outfile=outfile) + read_dict, kmer_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM) + write_output(resdict=read_dict, infile=INFILE, outfile=read_outfile) + write_output(resdict=kmer_dict, infile=INFILE, outfile=kmer_outfile) diff --git a/bin/merge_kraken_res.py b/bin/merge_kraken_res.py index c6c16e4c1..53e45a63d 100755 --- a/bin/merge_kraken_res.py +++ b/bin/merge_kraken_res.py @@ -15,21 +15,29 @@ def _get_args(): formatter_class=argparse.RawDescriptionHelpFormatter, 
description='Merging csv count files in one table') parser.add_argument( - '-o', - dest="output", - default="kraken_count_table.csv", - help="Output file. Default = kraken_count_table.csv") + '-or', + dest="readout", + default="kraken_read_count_table.csv", + help="Read count output file. Default = kraken_read_count_table.csv") + parser.add_argument( + '-ok', + dest="kmerout", + default="kraken_kmer_unicity_table.csv", + help="Kmer unicity output file. Default = kraken_kmer_unicity_table.csv") args = parser.parse_args() - outfile = args.output + readout = args.readout + kmerout = args.kmerout - return(outfile) + return(readout, kmerout) def get_csv(): tmp = [i for i in os.listdir() if ".csv" in i] - return(tmp) + kmer = [i for i in tmp if '.kmer_' in i] + read = [i for i in tmp if '.read_' in i] + return(read, kmer) def _get_basename(file_name): @@ -54,8 +62,9 @@ def write_csv(pd_dataframe, outfile): if __name__ == "__main__": - OUTFILE = _get_args() - all_csv = get_csv() - resdf = merge_csv(all_csv) - write_csv(resdf, OUTFILE) - print(resdf) + READOUT, KMEROUT = _get_args() + reads, kmers = get_csv() + read_df = merge_csv(reads) + kmer_df = merge_csv(kmers) + write_csv(read_df, READOUT) + write_csv(kmer_df, KMEROUT) \ No newline at end of file diff --git a/docs/output.md b/docs/output.md index ff4f13ee2..412701660 100644 --- a/docs/output.md +++ b/docs/output.md @@ -669,7 +669,11 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir - `metagenomic_complexity_filter` - this contains the output from filtering of input reads to metagenomic classification of low-sequence complexity reads as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `._bbduk.stats` files to get summary statistics of the filtering. 
- `metagenomic_classification/` - this contains the output for a given metagenomic classifier. - Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested. - - Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. + - Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check for breadth of coverage and detect read stacking. A small number of aligned reads (low coverage) and a kmer duplication >1 is usually a sign of read stacking, usually indicative of a false positive hit (e.g. from over-amplified libraries). *Kmer duplication is defined as: number of kmers / number of unique kmers*. You will find two kraken report formats available: + - the `*.kreport` which is the old report format, without distinct minimizer count information, used by some tools such as [Pavian](https://github.com/fbreitwieser/pavian) + - the `*.kraken2_report` which is the new kraken report format, with the distinct minimizer count information. + + Finally, the `*.kraken.out` files are the direct output of Kraken2 - `maltextract/` - this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. 
Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA) - `consensus_sequence/` - this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively. - `librarymerged_bams/` - these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on) diff --git a/environment.yml b/environment.yml index fdddd5da5..76fba2aa0 100644 --- a/environment.yml +++ b/environment.yml @@ -37,7 +37,7 @@ dependencies: - bioconda::bamutil=1.0.14 - bioconda::mtnucratio=0.7 - bioconda::pysam=0.15.4 #Says python3.7 or less - - bioconda::kraken2=2.0.9beta + - bioconda::kraken2=2.1.1 - conda-forge::pandas=1.0.4 #.4 is python3.8+ compatible - bioconda::freebayes=1.3.2 #should be fine with python 3.8, but says <3.7 on webpage - bioconda::sexdeterrmine=1.1.2 diff --git a/main.nf b/main.nf index 081bbd33a..f8324d869 100644 --- a/main.nf +++ b/main.nf @@ -2894,15 +2894,17 @@ process kraken { output: file "*.kraken.out" into ch_kraken_out - tuple prefix, path("*.kreport") into ch_kraken_report, ch_kraken_for_multiqc + tuple prefix, path("*.kraken2_report") into ch_kraken_report, ch_kraken_for_multiqc script: prefix = fastq.toString().tokenize('.')[0] out = prefix+".kraken.out" - kreport = prefix+".kreport" + kreport = prefix+".kraken2_report" + kreport_old = prefix+".kreport" """ - kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report $kreport 
$fastq + kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report-minimizer-data --report $kreport $fastq + cut -f1-3,6-8 $kreport > $kreport_old """ } @@ -2914,12 +2916,13 @@ process kraken_parse { tuple val(name), path(kraken_r) from ch_kraken_report output: - tuple val(name), path('*.kraken_parsed.csv') into ch_kraken_parsed + path('*_kraken_parsed.csv') into ch_kraken_parsed script: - out = name+".kraken_parsed.csv" + read_out = name+".read_kraken_parsed.csv" + kmer_out = name+".kmer_kraken_parsed.csv" """ - kraken_parse.py -c ${params.metagenomic_min_support_reads} -o $out $kraken_r + kraken_parse.py -c ${params.metagenomic_min_support_reads} -or $read_out -ok $kmer_out $kraken_r """ } @@ -2927,15 +2930,16 @@ process kraken_merge { publishDir "${params.outdir}/metagenomic_classification/kraken", mode: params.publish_dir_mode input: - file csv_count from ch_kraken_parsed.map{ it[1] }.collect() + file csv_count from ch_kraken_parsed.collect() output: - path('kraken_count_table.csv') + path('*.csv') script: - out = "kraken_count_table.csv" + read_out = "kraken_read_count.csv" + kmer_out = "kraken_kmer_duplication.csv" """ - merge_kraken_res.py -o $out + merge_kraken_res.py -or $read_out -ok $kmer_out """ }