Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

### `Added`

- [#687](https://github.com/nf-core/eager/pull/687) - Adding Kraken2 unique kmer counting
Comment thread
apeltzer marked this conversation as resolved.
Outdated

### `Fixed`

- [#666](https://github.com/nf-core/eager/issues/666) - Fixed input file staging for `print_nuclear_contamination`
Expand Down
50 changes: 36 additions & 14 deletions bin/kraken_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,24 @@ def _get_args():
default=50,
help="Minimum number of hits on clade to report it. Default = 50")
parser.add_argument(
'-o',
dest="output",
'-or',
dest="readout",
default=None,
help="Output file. Default = <basename>.kraken_parsed.csv")
help="Read count output file. Default = <basename>.read_kraken_parsed.csv")
parser.add_argument(
'-ok',
dest="kmerout",
default=None,
help="Kmer Output file. Default = <basename>.kmer_kraken_parsed.csv")

args = parser.parse_args()

infile = args.krakenReport
countlim = int(args.count)
outfile = args.output
readout = args.readout
kmerout = args.kmerout

return(infile, countlim, outfile)
return(infile, countlim, readout, kmerout)


def _get_basename(file_name):
Expand All @@ -51,14 +57,23 @@ def parse_kraken(infile, countlim):

'''
with open(infile, 'r') as f:
resdict = {}
read_dict = {}
kmer_dict = {}
csvreader = csv.reader(f, delimiter='\t')
for line in csvreader:
reads = int(line[1])
if reads >= countlim:
taxid = line[4]
resdict[taxid] = reads
return(resdict)
taxid = line[6]
kmer = line[3]
unique_kmer = line[4]
try:
kmer_duplicity = float(kmer)/float(unique_kmer)
except ZeroDivisionError:
kmer_duplicity = 0
read_dict[taxid] = reads
kmer_dict[taxid] = kmer_duplicity

return(read_dict, kmer_dict)


def write_output(resdict, infile, outfile):
Expand All @@ -70,10 +85,17 @@ def write_output(resdict, infile, outfile):


if __name__ == '__main__':
INFILE, COUNTLIM, outfile = _get_args()
INFILE, COUNTLIM, readout, kmerout = _get_args()

if not outfile:
outfile = _get_basename(INFILE)+".kraken_parsed.csv"
if not readout:
read_outfile = _get_basename(INFILE)+".read_kraken_parsed.csv"
else:
read_outfile = readout
if not kmerout:
kmer_outfile = _get_basename(INFILE)+".kmer_kraken_parsed.csv"
else:
kmer_outfile = kmerout

tmp_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM)
write_output(resdict=tmp_dict, infile=INFILE, outfile=outfile)
read_dict, kmer_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM)
write_output(resdict=read_dict, infile=INFILE, outfile=read_outfile)
write_output(resdict=kmer_dict, infile=INFILE, outfile=kmer_outfile)
33 changes: 21 additions & 12 deletions bin/merge_kraken_res.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,29 @@ def _get_args():
formatter_class=argparse.RawDescriptionHelpFormatter,
description='Merging csv count files in one table')
parser.add_argument(
'-o',
dest="output",
default="kraken_count_table.csv",
help="Output file. Default = kraken_count_table.csv")
'-or',
dest="readout",
default="kraken_read_count_table.csv",
help="Read count output file. Default = kraken_read_count_table.csv")
parser.add_argument(
'-ok',
dest="kmerout",
default="kraken_kmer_unicity_table.csv",
help="Kmer unicity output file. Default = kraken_kmer_unicity_table.csv")

args = parser.parse_args()

outfile = args.output
readout = args.readout
kmerout = args.kmerout

return(outfile)
return(readout, kmerout)


def get_csv():
tmp = [i for i in os.listdir() if ".csv" in i]
return(tmp)
kmer = [i for i in tmp if '.kmer_' in i]
read = [i for i in tmp if '.read_' in i]
return(read, kmer)


def _get_basename(file_name):
Expand All @@ -54,8 +62,9 @@ def write_csv(pd_dataframe, outfile):


if __name__ == "__main__":
OUTFILE = _get_args()
all_csv = get_csv()
resdf = merge_csv(all_csv)
write_csv(resdf, OUTFILE)
print(resdf)
READOUT, KMEROUT = _get_args()
reads, kmers = get_csv()
read_df = merge_csv(reads)
kmer_df = merge_csv(kmers)
write_csv(read_df, READOUT)
write_csv(kmer_df, KMEROUT)
2 changes: 1 addition & 1 deletion docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
- `metagenomic_complexity_filter` - this contains the output from filtering of input reads to metagenomic classification of low-sequence complexity reads as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `._bbduk.stats` files to get summary statistics of the filtering.
- `metagenomic_classification/` - this contains the output for a given metagenomic classifier.
- Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additionally, a `malt.log` file is provided, which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzipped SAM files if requested.
- Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table.
- Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check the breadth of coverage and to detect read stacking. A small number of aligned reads (low coverage) combined with a kmer duplication >>1 is usually a sign of read stacking, which typically indicates a false-positive hit. *Kmer duplication is defined as: number of kmers / number of unique kmers*
Comment thread
apeltzer marked this conversation as resolved.
Outdated
- `maltextract/` - this contains a `results` directory which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA)
- `consensus_sequence/` - this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively.
- `librarymerged_bams/` - these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on)
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ dependencies:
- bioconda::bamutil=1.0.14
- bioconda::mtnucratio=0.7
- bioconda::pysam=0.15.4 #Says python3.7 or less
- bioconda::kraken2=2.0.9beta
- bioconda::kraken2=2.1.1
- conda-forge::pandas=1.0.4 #.4 is python3.8+ compatible
- bioconda::freebayes=1.3.2 #should be fine with python 3.8, but says <3.7 on webpage
- bioconda::sexdeterrmine=1.1.2
Expand Down
38 changes: 29 additions & 9 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3136,15 +3136,33 @@ process kraken {

output:
file "*.kraken.out" into ch_kraken_out
tuple prefix, path("*.kreport") into ch_kraken_report, ch_kraken_for_multiqc
tuple prefix, path("*.kraken2_report") into ch_kraken_report, ch_kraken_report_backward_compatibility
Comment thread
apeltzer marked this conversation as resolved.
Outdated

script:
prefix = fastq.toString().tokenize('.')[0]
out = prefix+".kraken.out"
kreport = prefix+".kraken2_report"

"""
kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report-minimizer-data --report $kreport $fastq
"""
}

process kraken_report_backward_compatibility {
tag "$prefix"
label 'sc_tiny'

input:
tuple val(prefix), path(kraken_r) from ch_kraken_report_backward_compatibility

output:
tuple prefix, path("*.kreport") into ch_kraken_for_multiqc

script:
kreport = prefix+".kreport"

"""
kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report $kreport $fastq
cut -f1-3,6-8 $kraken_r > $kreport
"""
}

Expand All @@ -3156,28 +3174,30 @@ process kraken_parse {
tuple val(name), path(kraken_r) from ch_kraken_report

output:
tuple val(name), path('*.kraken_parsed.csv') into ch_kraken_parsed
path('*_kraken_parsed.csv') into ch_kraken_parsed

script:
out = name+".kraken_parsed.csv"
read_out = name+".read_kraken_parsed.csv"
kmer_out = name+".kmer_kraken_parsed.csv"
"""
kraken_parse.py -c ${params.metagenomic_min_support_reads} -o $out $kraken_r
kraken_parse.py -c ${params.metagenomic_min_support_reads} -or $read_out -ok $kmer_out $kraken_r
"""
}

process kraken_merge {
publishDir "${params.outdir}/metagenomic_classification/kraken", mode: params.publish_dir_mode

input:
file csv_count from ch_kraken_parsed.map{ it[1] }.collect()
file csv_count from ch_kraken_parsed.collect()

output:
path('kraken_count_table.csv')
path('*.csv')

script:
out = "kraken_count_table.csv"
read_out = "kraken_read_count.csv"
kmer_out = "kraken_kmer_duplication.csv"
"""
merge_kraken_res.py -o $out
merge_kraken_res.py -or $read_out -ok $kmer_out
"""
}

Expand Down