Skip to content

Commit 8133006

Browse files
authored
Merge pull request #687 from maxibor/dev
2 parents 631e592 + 2a25cf7 commit 8133006

6 files changed

Lines changed: 78 additions & 38 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
77

88
### `Added`
99

10+
- [#687](https://github.com/nf-core/eager/pull/687) - Adds Kraken2 unique kmer counting report
1011
- [#676](https://github.com/nf-core/eager/issues/676) - Refactor help message / summary message formatting to automatic versions using nf-core library
1112
- [#682](https://github.com/nf-core/eager/issues/682) - Add AdapterRemoval `--qualitymax` flag to allow FASTQ Phred score range max more than 41
1213

bin/kraken_parse.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,24 @@ def _get_args():
1919
default=50,
2020
help="Minimum number of hits on clade to report it. Default = 50")
2121
parser.add_argument(
22-
'-o',
23-
dest="output",
22+
'-or',
23+
dest="readout",
2424
default=None,
25-
help="Output file. Default = <basename>.kraken_parsed.csv")
25+
help="Read count output file. Default = <basename>.read_kraken_parsed.csv")
26+
parser.add_argument(
27+
'-ok',
28+
dest="kmerout",
29+
default=None,
30+
help="Kmer Output file. Default = <basename>.kmer_kraken_parsed.csv")
2631

2732
args = parser.parse_args()
2833

2934
infile = args.krakenReport
3035
countlim = int(args.count)
31-
outfile = args.output
36+
readout = args.readout
37+
kmerout = args.kmerout
3238

33-
return(infile, countlim, outfile)
39+
return(infile, countlim, readout, kmerout)
3440

3541

3642
def _get_basename(file_name):
@@ -51,14 +57,23 @@ def parse_kraken(infile, countlim):
5157
5258
'''
5359
with open(infile, 'r') as f:
54-
resdict = {}
60+
read_dict = {}
61+
kmer_dict = {}
5562
csvreader = csv.reader(f, delimiter='\t')
5663
for line in csvreader:
5764
reads = int(line[1])
5865
if reads >= countlim:
59-
taxid = line[4]
60-
resdict[taxid] = reads
61-
return(resdict)
66+
taxid = line[6]
67+
kmer = line[3]
68+
unique_kmer = line[4]
69+
try:
70+
kmer_duplicity = float(kmer)/float(unique_kmer)
71+
except ZeroDivisionError:
72+
kmer_duplicity = 0
73+
read_dict[taxid] = reads
74+
kmer_dict[taxid] = kmer_duplicity
75+
76+
return(read_dict, kmer_dict)
6277

6378

6479
def write_output(resdict, infile, outfile):
@@ -70,10 +85,17 @@ def write_output(resdict, infile, outfile):
7085

7186

7287
if __name__ == '__main__':
73-
INFILE, COUNTLIM, outfile = _get_args()
88+
INFILE, COUNTLIM, readout, kmerout = _get_args()
7489

75-
if not outfile:
76-
outfile = _get_basename(INFILE)+".kraken_parsed.csv"
90+
if not readout:
91+
read_outfile = _get_basename(INFILE)+".read_kraken_parsed.csv"
92+
else:
93+
read_outfile = readout
94+
if not kmerout:
95+
kmer_outfile = _get_basename(INFILE)+".kmer_kraken_parsed.csv"
96+
else:
97+
kmer_outfile = kmerout
7798

78-
tmp_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM)
79-
write_output(resdict=tmp_dict, infile=INFILE, outfile=outfile)
99+
read_dict, kmer_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM)
100+
write_output(resdict=read_dict, infile=INFILE, outfile=read_outfile)
101+
write_output(resdict=kmer_dict, infile=INFILE, outfile=kmer_outfile)

bin/merge_kraken_res.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,29 @@ def _get_args():
1515
formatter_class=argparse.RawDescriptionHelpFormatter,
1616
description='Merging csv count files in one table')
1717
parser.add_argument(
18-
'-o',
19-
dest="output",
20-
default="kraken_count_table.csv",
21-
help="Output file. Default = kraken_count_table.csv")
18+
'-or',
19+
dest="readout",
20+
default="kraken_read_count_table.csv",
21+
help="Read count output file. Default = kraken_read_count_table.csv")
22+
parser.add_argument(
23+
'-ok',
24+
dest="kmerout",
25+
default="kraken_kmer_unicity_table.csv",
26+
help="Kmer unicity output file. Default = kraken_kmer_unicity_table.csv")
2227

2328
args = parser.parse_args()
2429

25-
outfile = args.output
30+
readout = args.readout
31+
kmerout = args.kmerout
2632

27-
return(outfile)
33+
return(readout, kmerout)
2834

2935

3036
def get_csv():
3137
tmp = [i for i in os.listdir() if ".csv" in i]
32-
return(tmp)
38+
kmer = [i for i in tmp if '.kmer_' in i]
39+
read = [i for i in tmp if '.read_' in i]
40+
return(read, kmer)
3341

3442

3543
def _get_basename(file_name):
@@ -54,8 +62,9 @@ def write_csv(pd_dataframe, outfile):
5462

5563

5664
if __name__ == "__main__":
57-
OUTFILE = _get_args()
58-
all_csv = get_csv()
59-
resdf = merge_csv(all_csv)
60-
write_csv(resdf, OUTFILE)
61-
print(resdf)
65+
READOUT, KMEROUT = _get_args()
66+
reads, kmers = get_csv()
67+
read_df = merge_csv(reads)
68+
kmer_df = merge_csv(kmers)
69+
write_csv(read_df, READOUT)
70+
write_csv(kmer_df, KMEROUT)

docs/output.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -671,7 +671,11 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
671671
- `metagenomic_complexity_filter` - this contains the output from filtering of input reads to metagenomic classification of low-sequence complexity reads as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `._bbduk.stats` files to get summary statistics of the filtering.
672672
- `metagenomic_classification/` - this contains the output for a given metagenomic classifier.
673673
- Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested.
674-
- Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table.
674+
- Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check for breadth of coverage and detect read stacking. A small number of aligned reads (low coverage) and a kmer duplication >1 is usually a sign of read stacking, usually indicative of a false positive hit (e.g. from over-amplified libraries). *Kmer duplication is defined as: number of kmers / number of unique kmers*. You will find two kraken report formats available:
675+
- the `*.kreport` which is the old report format, without distinct minimizer count information, used by some tools such as [Pavian](https://github.com/fbreitwieser/pavian)
676+
- the `*.kraken2_report` which is the new kraken report format, with the distinct minimizer count information.
677+
678+
Finally, the `*.kraken.out` files are the direct output of Kraken2
675679
- `maltextract/` - this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA)
676680
- `consensus_sequence/` - this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively.
677681
- `librarymerged_bams/` - these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on)

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ dependencies:
3737
- bioconda::bamutil=1.0.14
3838
- bioconda::mtnucratio=0.7
3939
- bioconda::pysam=0.15.4 #Says python3.7 or less
40-
- bioconda::kraken2=2.0.9beta
40+
- bioconda::kraken2=2.1.1
4141
- conda-forge::pandas=1.0.4 #.4 is python3.8+ compatible
4242
- bioconda::freebayes=1.3.2 #should be fine with python 3.8, but says <3.7 on webpage
4343
- bioconda::sexdeterrmine=1.1.2

main.nf

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2914,15 +2914,17 @@ process kraken {
29142914

29152915
output:
29162916
file "*.kraken.out" into ch_kraken_out
2917-
tuple prefix, path("*.kreport") into ch_kraken_report, ch_kraken_for_multiqc
2917+
tuple prefix, path("*.kraken2_report") into ch_kraken_report, ch_kraken_for_multiqc
29182918

29192919
script:
29202920
prefix = fastq.toString().tokenize('.')[0]
29212921
out = prefix+".kraken.out"
2922-
kreport = prefix+".kreport"
2922+
kreport = prefix+".kraken2_report"
2923+
kreport_old = prefix+".kreport"
29232924

29242925
"""
2925-
kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report $kreport $fastq
2926+
kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report-minimizer-data --report $kreport $fastq
2927+
cut -f1-3,6-8 $kreport > $kreport_old
29262928
"""
29272929
}
29282930

@@ -2934,28 +2936,30 @@ process kraken_parse {
29342936
tuple val(name), path(kraken_r) from ch_kraken_report
29352937

29362938
output:
2937-
tuple val(name), path('*.kraken_parsed.csv') into ch_kraken_parsed
2939+
path('*_kraken_parsed.csv') into ch_kraken_parsed
29382940

29392941
script:
2940-
out = name+".kraken_parsed.csv"
2942+
read_out = name+".read_kraken_parsed.csv"
2943+
kmer_out = name+".kmer_kraken_parsed.csv"
29412944
"""
2942-
kraken_parse.py -c ${params.metagenomic_min_support_reads} -o $out $kraken_r
2945+
kraken_parse.py -c ${params.metagenomic_min_support_reads} -or $read_out -ok $kmer_out $kraken_r
29432946
"""
29442947
}
29452948

29462949
process kraken_merge {
29472950
publishDir "${params.outdir}/metagenomic_classification/kraken", mode: params.publish_dir_mode
29482951

29492952
input:
2950-
file csv_count from ch_kraken_parsed.map{ it[1] }.collect()
2953+
file csv_count from ch_kraken_parsed.collect()
29512954

29522955
output:
2953-
path('kraken_count_table.csv')
2956+
path('*.csv')
29542957

29552958
script:
2956-
out = "kraken_count_table.csv"
2959+
read_out = "kraken_read_count.csv"
2960+
kmer_out = "kraken_kmer_duplication.csv"
29572961
"""
2958-
merge_kraken_res.py -o $out
2962+
merge_kraken_res.py -or $read_out -ok $kmer_out
29592963
"""
29602964
}
29612965

0 commit comments

Comments
 (0)