nf-core · TCLamnidis · Oct 6, 2020 · Sep 28, 2020 · Sep 28, 2020 · Sep 28, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -65,6 +65,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 * Added Sequencetools (1.4.0.6) that adds the ability to do genotyping with the 'pileupCaller'
 * Latest version of DeDup (0.12.6) which now reports mapped reads after deduplication
+* [#560] Latest version of DeDup (0.12.7), which now correctly reports deduplication statistics based on calculations of mapped reads only (the prior denominator was the total reads in the BAM file)
 * Latest version of ANGSD (0.933) which doesn't seg fault when running contamination on BAMs with insufficient reads
 * Latest version of MultiQC (1.9) with support for lots of extra tools in the pipeline (MALT, SexDetERRmine, DamageProfiler, MultiVCFAnalyzer)
 * Latest versions of Pygments (7.1), Pymdown-Extensions (2.6.1) and Markdown (3.2.2) for documentation output

diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
@@ -177,6 +177,9 @@ table_columns_visible:
         Method2_MOM_SE: False
         Method2_ML_estimate: False
         Method2_ML_SE: False
+    snp_coverage:
+        Covered_Snps: True
+        Total_Snps: False
 
 table_columns_placement:
     FastQC (pre-AdapterRemoval):
@@ -220,6 +223,9 @@ table_columns_placement:
         Method2_MOM_SE: 1160
         Method2_ML_estimate: 1170
         Method2_ML_SE: 1180
+    snp_coverage:
+        Covered_Snps: 1050
+        Total_Snps: 1060
     DeDup:
         mapped_after_dedup: 620
         clusterfactor: 630

diff --git a/bin/parse_snp_cov.py b/bin/parse_snp_cov.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+import sys, json
+from collections import OrderedDict
+
+jsonOut = OrderedDict()
+data = OrderedDict()
+
+
+input = open(sys.argv[1], 'r')
+for line in input:
+  fields = line.strip().split()
+  sample_id = fields[0]
+  covered_snps = fields[1]
+  total_snps = fields[2]
+  if sample_id[0] == "#":
+    continue
+
+  data[sample_id] = {"Covered_Snps":covered_snps, "Total_Snps":total_snps}
+
+jsonOut = {"plot_type": "generalstats", "id": "snp_coverage",
+    "pconfig": {
+        "Covered_Snps" : {"title" : "#SNPs Covered"},
+        "Total_Snps" : {"title": "#SNPs Total"}
+    }, 
+    "data" : data
+}
+
+with open(sys.argv[1].rstrip('.txt')+'_mqc.json', 'w') as outfile:
+    json.dump(jsonOut, outfile)
diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py
@@ -34,7 +34,8 @@
     'MTNucRatioCalculator':['v_mtnucratiocalculator.txt',r"Version: (\S+)"],
     'VCF2genome':['v_vcf2genome.txt', r"VCF2Genome \(v. ([0-9].[0-9]+) "],
     'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"],
-    'kraken':['v_kraken.txt', r"Kraken version (\S+)"]
+    'kraken':['v_kraken.txt', r"Kraken version (\S+)"],
+    'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"]
 }
 
 results = OrderedDict()
@@ -69,6 +70,7 @@
 results['malt'] = '<span style="color:#999999;\">N/A</span>'
 results['kraken'] = '<span style="color:#999999;\">N/A</span>'
 results['maltextract'] = '<span style="color:#999999;\">N/A</span>'
+results['eigenstrat_snp_coverage'] = '<span style="color:#999999;\">N/A</span>'
 
 # Search each file using its regex
 for k, v in regexes.items():

diff --git a/environment.yml b/environment.yml
@@ -16,7 +16,7 @@ dependencies:
   - bioconda::bwa=0.7.17
   - bioconda::picard=2.22.9
   - bioconda::samtools=1.9
-  - bioconda::dedup=0.12.6
+  - bioconda::dedup=0.12.7
   - bioconda::angsd=0.933
   - bioconda::circularmapper=1.93.5
   - bioconda::gatk4=4.1.7.0
@@ -33,7 +33,7 @@ dependencies:
   - bioconda::fastp=0.20.1
   - bioconda::bamutil=1.0.14
   - bioconda::mtnucratio=0.7
-  - pysam=0.15.4 #Says python3.7 or less
+  - bioconda::pysam=0.15.4 #Says python3.7 or less
   - bioconda::kraken2=2.0.9beta
   - conda-forge::pandas=1.0.4 #.4 is python3.8+ compatible
   - bioconda::freebayes=1.3.2 #should be fine with python 3.8, but says <3.7 on webpage
@@ -43,4 +43,5 @@ dependencies:
   - conda-forge::biopython=1.76
   - conda-forge::xopen=0.9.0
   - bioconda::bowtie2=2.4.1
+  - bioconda::eigenstratdatabasetools=1.0.2
   #Missing Schmutzi,snpAD
diff --git a/main.nf b/main.nf
@@ -2599,7 +2599,7 @@ if (params.pileupcaller_snpfile.isEmpty ()) {
   path(snp) from ch_snp_for_pileupcaller.collect().dump(tag: "Pileupcaller SNP file")
 
   output:
-  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("pileupcaller.${strandedness}.*")
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("pileupcaller.${strandedness}.*") into ch_for_eigenstrat_snp_coverage
 
   script:
   def use_bed = bed.getName() != 'nf-core_eager_dummy.txt' ? "-l ${bed}" : ''
@@ -2614,7 +2614,32 @@ if (params.pileupcaller_snpfile.isEmpty ()) {
   samtools mpileup -B -q 30 -Q 30 ${use_bed} -f ${fasta} ${bam_list} | pileupCaller ${caller} ${ssmode} ${transitions_mode} --sampleNames ${sample_names} ${use_snp} -e pileupcaller.${strandedness}
   """
  }
-
+
+ // Runs eigenstrat_snp_coverage on the pileupCaller output staged for one strandedness and
+ // converts the resulting coverage table to a MultiQC JSON with parse_snp_cov.py.
+ process eigenstrat_snp_coverage {
+   label 'mc_tiny'
+   tag "${strandedness}"
+   publishDir "${params.outdir}/genotyping", mode: params.publish_dir_mode
+
+   when:
+   params.run_genotyping && params.genotyping_tool == 'pileupcaller'
+
+   input:
+   tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump()
+
+   output:
+   tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc
+   path("*_eigenstrat_coverage.txt")
+
+   script:
+   // NOTE(review): a disabled variant passed `-j <json>` to eigenstrat_snp_coverage instead of running parse_snp_cov.py.
+   """
+   eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt
+   parse_snp_cov.py ${strandedness}_eigenstrat_coverage.txt
+   """
+ }
+
  process genotyping_angsd {
   label 'mc_small'
   tag "${samplename}"
@@ -3139,6 +3164,7 @@ process get_software_versions {
     endorS.py --version &> v_endorSpy.txt || true
     pileupCaller --version &> v_sequencetools.txt 2>&1 || true
     bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true
+    eigenstrat_snp_coverage --version | cut -d ' ' -f2 >v_eigenstrat_snp_coverage.txt || true
 
     scrape_software_versions.py &> software_versions_mqc.yaml
     """
@@ -3176,6 +3202,7 @@ process multiqc {
     file ('kraken/*') from ch_kraken_for_multiqc.collect().ifEmpty([])
     file ('hops/*') from ch_hops_for_multiqc.collect().ifEmpty([])
     file ('nuclear_contamination/*') from ch_nuclear_contamination_for_multiqc.collect().ifEmpty([])
+    file ('genotyping/*') from ch_eigenstrat_snp_cov_for_multiqc.map{ it[7] }.collect().ifEmpty([]) // keep only the JSON path (tuple index 7); collect once, and still run when genotyping is off
 
     file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml")