Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

### `Added`

- [#687](https://github.com/nf-core/eager/pull/687) - Adds Kraken2 unique kmer counting report
- [#676](https://github.com/nf-core/eager/issues/676) - Refactor help message / summary message formatting to automatic versions using nf-core library
- [#682](https://github.com/nf-core/eager/issues/682) - Add AdapterRemoval `--qualitymax` flag to allow FASTQ Phred score range max more than 41

Expand Down
50 changes: 36 additions & 14 deletions bin/kraken_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,24 @@ def _get_args():
default=50,
help="Minimum number of hits on clade to report it. Default = 50")
parser.add_argument(
'-o',
dest="output",
'-or',
dest="readout",
default=None,
help="Output file. Default = <basename>.kraken_parsed.csv")
help="Read count output file. Default = <basename>.read_kraken_parsed.csv")
parser.add_argument(
'-ok',
dest="kmerout",
default=None,
help="Kmer Output file. Default = <basename>.kmer_kraken_parsed.csv")

args = parser.parse_args()

infile = args.krakenReport
countlim = int(args.count)
outfile = args.output
readout = args.readout
kmerout = args.kmerout

return(infile, countlim, outfile)
return(infile, countlim, readout, kmerout)


def _get_basename(file_name):
Expand All @@ -51,14 +57,23 @@ def parse_kraken(infile, countlim):

'''
with open(infile, 'r') as f:
resdict = {}
read_dict = {}
kmer_dict = {}
csvreader = csv.reader(f, delimiter='\t')
for line in csvreader:
reads = int(line[1])
if reads >= countlim:
taxid = line[4]
resdict[taxid] = reads
return(resdict)
taxid = line[6]
kmer = line[3]
unique_kmer = line[4]
try:
kmer_duplicity = float(kmer)/float(unique_kmer)
except ZeroDivisionError:
kmer_duplicity = 0
read_dict[taxid] = reads
kmer_dict[taxid] = kmer_duplicity

return(read_dict, kmer_dict)


def write_output(resdict, infile, outfile):
Expand All @@ -70,10 +85,17 @@ def write_output(resdict, infile, outfile):


if __name__ == '__main__':
INFILE, COUNTLIM, outfile = _get_args()
INFILE, COUNTLIM, readout, kmerout = _get_args()

if not outfile:
outfile = _get_basename(INFILE)+".kraken_parsed.csv"
if not readout:
read_outfile = _get_basename(INFILE)+".read_kraken_parsed.csv"
else:
read_outfile = readout
if not kmerout:
kmer_outfile = _get_basename(INFILE)+".kmer_kraken_parsed.csv"
else:
kmer_outfile = kmerout

tmp_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM)
write_output(resdict=tmp_dict, infile=INFILE, outfile=outfile)
read_dict, kmer_dict = parse_kraken(infile=INFILE, countlim=COUNTLIM)
write_output(resdict=read_dict, infile=INFILE, outfile=read_outfile)
write_output(resdict=kmer_dict, infile=INFILE, outfile=kmer_outfile)
33 changes: 21 additions & 12 deletions bin/merge_kraken_res.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,29 @@ def _get_args():
formatter_class=argparse.RawDescriptionHelpFormatter,
description='Merging csv count files in one table')
parser.add_argument(
'-o',
dest="output",
default="kraken_count_table.csv",
help="Output file. Default = kraken_count_table.csv")
'-or',
dest="readout",
default="kraken_read_count_table.csv",
help="Read count output file. Default = kraken_read_count_table.csv")
parser.add_argument(
'-ok',
dest="kmerout",
default="kraken_kmer_unicity_table.csv",
help="Kmer unicity output file. Default = kraken_kmer_unicity_table.csv")

args = parser.parse_args()

outfile = args.output
readout = args.readout
kmerout = args.kmerout

return(outfile)
return(readout, kmerout)


def get_csv():
tmp = [i for i in os.listdir() if ".csv" in i]
return(tmp)
kmer = [i for i in tmp if '.kmer_' in i]
read = [i for i in tmp if '.read_' in i]
return(read, kmer)


def _get_basename(file_name):
Expand All @@ -54,8 +62,9 @@ def write_csv(pd_dataframe, outfile):


if __name__ == "__main__":
OUTFILE = _get_args()
all_csv = get_csv()
resdf = merge_csv(all_csv)
write_csv(resdf, OUTFILE)
print(resdf)
READOUT, KMEROUT = _get_args()
reads, kmers = get_csv()
read_df = merge_csv(reads)
kmer_df = merge_csv(kmers)
write_csv(read_df, READOUT)
write_csv(kmer_df, KMEROUT)
6 changes: 5 additions & 1 deletion docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,11 @@ Each module has its own output directory which sits alongside the `MultiQC/` dir
- `metagenomic_complexity_filter` - this contains the output from filtering of input reads to metagenomic classification of low-sequence complexity reads as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and also the run-time log (`_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, therefore you must check the `._bbduk.stats` files to get summary statistics of the filtering.
- `metagenomic_classification/` - this contains the output for a given metagenomic classifier.
- Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additionally, a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzipped SAM files if requested.
- Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table.
- Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. You will also get a Kraken kmer duplication table, in a [KrakenUniq](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1568-0) fashion. This is very useful to check for breadth of coverage and detect read stacking. A small number of aligned reads (low coverage) and a kmer duplication >1 is usually a sign of read stacking, usually indicative of a false positive hit (e.g. from over-amplified libraries). *Kmer duplication is defined as: number of kmers / number of unique kmers*. You will find two kraken report formats available:
  - the `*.kreport` which is the old report format, without distinct minimizer count information, used by some tools such as [Pavian](https://github.com/fbreitwieser/pavian)
- the `*.kraken2_report` which is the new kraken report format, with the distinct minimizer count information.

Finally, the `*.kraken.out` files are the direct output of Kraken2.
- `maltextract/` - this contains a `results` directory in which contains the output from MaltExtract - typically one folder for each filter type, an error and a log file. The characteristics of each node (e.g. damage, read lengths, edit distances - each in different txt formats) can be seen in each sub-folder of the filter folders. Output can be visualised either with the [HOPS postprocessing script](https://github.com/rhuebler/HOPS) or [MEx-IPA](https://github.com/jfy133/MEx-IPA)
- `consensus_sequence/` - this contains three FASTA files from VCF2Genome of a consensus sequence based on the reference FASTA with each sample's unique modifications. The main FASTA is a standard file with bases not passing the specified thresholds as Ns. The two other FASTAS (`_refmod.fasta.gz`) and (`_uncertainity.fasta.gz`) are IUPAC uncertainty codes (rather than Ns) and a special number-based uncertainty system used for other downstream tools, respectively.
- `librarymerged_bams/` - these contain the final BAM files that would go into genotyping (if genotyping is turned on). This means the files will contain all libraries of a given sample (including trimmed non-UDG or half-UDG treated libraries, if BAM trimming turned on)
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ dependencies:
- bioconda::bamutil=1.0.14
- bioconda::mtnucratio=0.7
- bioconda::pysam=0.15.4 #Says python3.7 or less
- bioconda::kraken2=2.0.9beta
- bioconda::kraken2=2.1.1
- conda-forge::pandas=1.0.4 #.4 is python3.8+ compatible
- bioconda::freebayes=1.3.2 #should be fine with python 3.8, but says <3.7 on webpage
- bioconda::sexdeterrmine=1.1.2
Expand Down
24 changes: 14 additions & 10 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2894,15 +2894,17 @@ process kraken {

output:
file "*.kraken.out" into ch_kraken_out
tuple prefix, path("*.kreport") into ch_kraken_report, ch_kraken_for_multiqc
tuple prefix, path("*.kraken2_report") into ch_kraken_report, ch_kraken_for_multiqc

script:
prefix = fastq.toString().tokenize('.')[0]
out = prefix+".kraken.out"
kreport = prefix+".kreport"
kreport = prefix+".kraken2_report"
kreport_old = prefix+".kreport"

"""
kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report $kreport $fastq
kraken2 --db ${krakendb} --threads ${task.cpus} --output $out --report-minimizer-data --report $kreport $fastq
cut -f1-3,6-8 $kreport > $kreport_old
"""
}

Expand All @@ -2914,28 +2916,30 @@ process kraken_parse {
tuple val(name), path(kraken_r) from ch_kraken_report

output:
tuple val(name), path('*.kraken_parsed.csv') into ch_kraken_parsed
path('*_kraken_parsed.csv') into ch_kraken_parsed

script:
out = name+".kraken_parsed.csv"
read_out = name+".read_kraken_parsed.csv"
kmer_out = name+".kmer_kraken_parsed.csv"
"""
kraken_parse.py -c ${params.metagenomic_min_support_reads} -o $out $kraken_r
kraken_parse.py -c ${params.metagenomic_min_support_reads} -or $read_out -ok $kmer_out $kraken_r
"""
}

process kraken_merge {
publishDir "${params.outdir}/metagenomic_classification/kraken", mode: params.publish_dir_mode

input:
file csv_count from ch_kraken_parsed.map{ it[1] }.collect()
file csv_count from ch_kraken_parsed.collect()

output:
path('kraken_count_table.csv')
path('*.csv')

script:
out = "kraken_count_table.csv"
read_out = "kraken_read_count.csv"
kmer_out = "kraken_kmer_duplication.csv"
"""
merge_kraken_res.py -o $out
merge_kraken_res.py -or $read_out -ok $kmer_out
"""
}

Expand Down