Skip to content

Commit 8133006

Browse files
authored
Merge pull request #687 from maxibor/dev
2 parents 631e592 + 2a25cf7 commit 8133006

6 files changed

Lines changed: 78 additions & 38 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
77

88
### `Added`
99

10+
- [#687](https://github.com/nf-core/eager/pull/687) - Adds Kraken2 unique kmer counting report
1011
- [#676](https://github.com/nf-core/eager/issues/676) - Refactor help message / summary message formatting to automatic versions using nf-core library
1112
- [#682](https://github.com/nf-core/eager/issues/682) - Add AdapterRemoval `--qualitymax` flag to allow FASTQ Phred score range max more than 41
1213

bin/kraken_parse.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,24 @@ def _get_args():
1919
default=50,
2020
help="Minimum number of hits on clade to report it. Default = 50")
2121
parser.add_argument(
22-
'-o',
23-
dest="output",
22+
'-or',
23+
dest="readout",
2424
default=None,
25-
help="Output file. Default = <basename>.kraken_parsed.csv")
25+
help="Read count output file. Default = <basename>.read_kraken_parsed.csv")
26+
parser.add_argument(
27+
'-ok',
28+
dest="kmerout",
29+
default=None,
30+
help="Kmer Output file. Default = <basename>.kmer_kraken_parsed.csv")
2631

2732
args = parser.parse_args()
2833

2934
infile = args.krakenReport
3035
countlim = int(args.count)
31-
outfile = args.output
36+
readout = args.readout
37+
kmerout = args.kmerout
3238

33-
return(infile, countlim, outfile)
39+
return(infile, countlim, readout, kmerout)
3440

3541

3642
def _get_basename(file_name):
@@ -51,14 +57,23 @@ def parse_kraken(infile, countlim):
5157
5258
'''
5359
with open(infile, 'r') as f:
54-
resdict = {}
60+
read_dict = {}
61+
kmer_dict = {}
5562
csvreader = csv.reader(f, delimiter='\t')
5663
for line in csvreader:
5764
reads = int(line[1])
5865
if reads >= countlim:
59-
taxid = line[4]
60-
resdict[taxid] = reads
61-
return(resdict)
66+
taxid = line[6]
67+
kmer = line[3]
68+
unique_kmer = line[4]
69+
try:
70+
kmer_duplicity = float(kmer)/float(unique_kmer)
71+
except ZeroDivisionError:
72+
kmer_duplicity = 0
73+
read_dict[taxid] = reads
74+
kmer_dict[taxid] = kmer_duplicity
75+
76+
return(read_dict, kmer_dict)
6277

6378

6479
def write_output(resdict, infile, outfile):
@@ -70,10 +85,17 @@ def write_output(resdict, infile, outfile):
7085

7186

7287
if __name__ == '__main__':
73-
INFILE, COUNTLIM, outfile = _get_args()
88+
INFILE, COUNTLIM, readout, kmerout = _get_args()
7489

75-
if not outfile:
76-
outfile = _get_basename(INFILE)+".kraken_parsed.csv"
90+
if not readout:
91+
read_outfile = _get_basename(INFILE)+".read_kraken_parsed.csv"
92+
else:
93+
read_outfile = readout
94+
if not kmerout:
95+
kmer_outfile = _get_basename(INFILE)+".kmer_kraken_parsed.csv"
96+
else:
97+
kmer_outfile = kmerout
7798

78-
tmp_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM)
79-
write_output(resdict=tmp_dict, infile=INFILE, outfile=outfile)
99+
read_dict, kmer_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM)
100+
write_output(resdict=read_dict, infile=INFILE, outfile=read_outfile)
101+
write_output(resdict=kmer_dict, infile=INFILE, outfile=kmer_outfile)

bin/merge_kraken_res.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,29 @@ def _get_args():
1515
formatter_class=argparse.RawDescriptionHelpFormatter,
1616
description='Merging csv count files in one table')
1717
parser.add_argument(
18-
'-o',
19-
dest="output",
20-
default="kraken_count_table.csv",
21-
help="Output file. Default = kraken_count_table.csv")
18+
'-or',
19+
dest="readout",
20+
default="kraken_read_count_table.csv",
21+
help="Read count output file. Default = kraken_read_count_table.csv")
22+
parser.add_argument(
23+
'-ok',
24+
dest="kmerout",
25+
default="kraken_kmer_unicity_table.csv",
26+
help="Kmer unicity output file. Default = kraken_kmer_unicity_table.csv")
2227

2328
args = parser.parse_args()
2429

25-
outfile = args.output
30+
readout = args.readout
31+
kmerout = args.kmerout
2632

27-
return(outfile)
33+
return(readout, kmerout)
2834

2935

3036
def get_csv():
3137
tmp = [i for i in os.listdir() if ".csv" in i]
32-
return(tmp)
38+
kmer = [i for i in tmp if '.kmer_' in i]
39+
read = [i for i in tmp if '.read_' in i]
40+
return(read, kmer)
3341

3442

3543
def _get_basename(file_name):
@@ -54,8 +62,9 @@ def write_csv(pd_dataframe, outfile):
5462

5563

5664
if __name__ == "__main__":
57-
OUTFILE = _get_args()
58-
all_csv = get_csv()
59-
resdf = merge_csv(all_csv)
60-
write_csv(resdf, OUTFILE)
61-
print(resdf)
65+
READOUT, KMEROUT = _get_args()
66+
reads, kmers = get_csv()
67+
read_df = merge_csv(reads)
68+
kmer_df = merge_csv(kmers)
69+
write_csv(read_df, READOUT)
70+
write_csv(kmer_df, KMEROUT)

docs/output.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -671,7 +671,11 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
671671
- `metagenomic_complexity_filter` - this contains the output from filtering of input reads to metagenomic classification of low-sequence complexity reads as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `._bbduk.stats` files to get summary statistics of the filtering.
672672
- `metagenomic_classification/` - this contains the output for a given metagenomic classifier.
673673
- Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested.
674-
- Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table.
674+
- Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check for breadth of coverage and detect read stacking. A small number of aligned reads (low coverage) and a kmer duplication >1 is usually a sign of read stacking, usually indicative of a false positive hit (e.g. from over-amplified libraries). *Kmer duplication is defined as: number of kmers / number of unique kmers*. You will find two kraken report formats available:
675+
- the `*.kreport` which is the old report format, without distinct minimizer count information, used by some tools such as [Pavian](https://github.com/fbreitwieser/pavian)
676+
- the `*.kraken2_report` which is the new kraken report format, with the distinct minimizer count information.
677+
678+
Finally, the `*.kraken.out` files are the direct output of Kraken2
675679
- `maltextract/` - this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA)
676680
- `consensus_sequence/` - this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively.
677681
- `librarymerged_bams/` - these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on)

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ dependencies:
3737
- bioconda::bamutil=1.0.14
3838
- bioconda::mtnucratio=0.7
3939
- bioconda::pysam=0.15.4 #Says python3.7 or less
40-
- bioconda::kraken2=2.0.9beta
40+
- bioconda::kraken2=2.1.1
4141
- conda-forge::pandas=1.0.4 #.4 is python3.8+ compatible
4242
- bioconda::freebayes=1.3.2 #should be fine with python 3.8, but says <3.7 on webpage
4343
- bioconda::sexdeterrmine=1.1.2

main.nf

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2914,15 +2914,17 @@ process kraken {
29142914

29152915
output:
29162916
file "*.kraken.out" into ch_kraken_out
2917-
tuple prefix, path("*.kreport") into ch_kraken_report, ch_kraken_for_multiqc
2917+
tuple prefix, path("*.kraken2_report") into ch_kraken_report, ch_kraken_for_multiqc
29182918

29192919
script:
29202920
prefix = fastq.toString().tokenize('.')[0]
29212921
out = prefix+".kraken.out"
2922-
kreport = prefix+".kreport"
2922+
kreport = prefix+".kraken2_report"
2923+
kreport_old = prefix+".kreport"
29232924

29242925
"""
2925-
kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report $kreport $fastq
2926+
kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report-minimizer-data --report $kreport $fastq
2927+
cut -f1-3,6-8 $kreport > $kreport_old
29262928
"""
29272929
}
29282930

@@ -2934,28 +2936,30 @@ process kraken_parse {
29342936
tuple val(name), path(kraken_r) from ch_kraken_report
29352937

29362938
output:
2937-
tuple val(name), path('*.kraken_parsed.csv') into ch_kraken_parsed
2939+
path('*_kraken_parsed.csv') into ch_kraken_parsed
29382940

29392941
script:
2940-
out = name+".kraken_parsed.csv"
2942+
read_out = name+".read_kraken_parsed.csv"
2943+
kmer_out = name+".kmer_kraken_parsed.csv"
29412944
"""
2942-
kraken_parse.py -c ${params.metagenomic_min_support_reads} -o $out $kraken_r
2945+
kraken_parse.py -c ${params.metagenomic_min_support_reads} -or $read_out -ok $kmer_out $kraken_r
29432946
"""
29442947
}
29452948

29462949
process kraken_merge {
29472950
publishDir "${params.outdir}/metagenomic_classification/kraken", mode: params.publish_dir_mode
29482951

29492952
input:
2950-
file csv_count from ch_kraken_parsed.map{ it[1] }.collect()
2953+
file csv_count from ch_kraken_parsed.collect()
29512954

29522955
output:
2953-
path('kraken_count_table.csv')
2956+
path('*.csv')
29542957

29552958
script:
2956-
out = "kraken_count_table.csv"
2959+
read_out = "kraken_read_count.csv"
2960+
kmer_out = "kraken_kmer_duplication.csv"
29572961
"""
2958-
merge_kraken_res.py -o $out
2962+
merge_kraken_res.py -or $read_out -ok $kmer_out
29592963
"""
29602964
}
29612965

0 commit comments

Comments
 (0)