Merge pull request #565 from nf-core/dedup

TCLamnidis · web-flow · commit aff33adcd8f5 · 2020-10-06T18:55:43.000+02:00
Bump DeDup for stats fix and DamageProfiler to latest version
Add eigenstrat snp coverage statistics.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -35,6 +35,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 * [#544](https://github.com/nf-core/eager/pull/544) Add script to perform bam filtering on fragment length
 * [#456](https://github.com/nf-core/eager/pull/546) Bumps the base (default) runtime of all processes to 4 hours, and set shorter timelimits for test profiles (1 hour)
 * [#552](https://github.com/nf-core/eager/issues/552) Adds optional creation of MALT SAM files alongside RMA6 files.
+* Added eigenstrat snp coverage statistics to MultiQC report. Process results are published in `genotyping/*_eigenstrat_coverage.txt`.
 
 ### `Fixed`
 
@@ -67,6 +68,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 * Added Sequencetools (1.4.0.6) that adds the ability to do genotyping with the 'pileupCaller'
 * Latest version of DeDup (0.12.6) which now reports mapped reads after deduplication
+* [#560] Latest version of Dedup (0.12.7), which now correctly reports deduplication statistics based on calculations of mapped reads only (prior denominator was total reads of BAM file)
 * Latest version of ANGSD (0.933) which doesn't seg fault when running contamination on BAMs with insufficient reads
 * Latest version of MultiQC (1.9) with support for lots of extra tools in the pipeline (MALT, SexDetERRmine, DamageProfiler, MultiVCFAnalyzer)
 * Latest versions of Pygments (7.1), Pymdown-Extensions (2.6.1) and Markdown (3.2.2) for documentation output
diff --git a/README.md b/README.md
@@ -195,6 +195,7 @@ If you've contributed and you're missing in here, please let us know and we will
 * **endorS.py** Aida Andrades Valtueña (Unpublished). Download: [https://github.com/aidaanva/endorS.py](https://github.com/aidaanva/endorS.py)
 * **Bowtie2**  Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923).
 * **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools)
+* **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git)
 
 ## Data References
 
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
@@ -177,6 +177,9 @@ table_columns_visible:
         Method2_MOM_SE: False
         Method2_ML_estimate: False
         Method2_ML_SE: False
+    snp_coverage:
+        Covered_Snps: True
+        Total_Snps: False
 
 table_columns_placement:
     FastQC (pre-AdapterRemoval):
@@ -220,6 +223,9 @@ table_columns_placement:
         Method2_MOM_SE: 1160
         Method2_ML_estimate: 1170
         Method2_ML_SE: 1180
+    snp_coverage:
+        Covered_Snps: 1050
+        Total_Snps: 1060
     DeDup:
         mapped_after_dedup: 620
         clusterfactor: 630
diff --git a/bin/parse_snp_cov.py b/bin/parse_snp_cov.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+import sys, json
+from collections import OrderedDict
+
+jsonOut = OrderedDict()
+data = OrderedDict()
+
+
+input = open(sys.argv[1], 'r')
+for line in input:
+  fields = line.strip().split()
+  sample_id = fields[0]
+  covered_snps = fields[1]
+  total_snps = fields[2]
+  if sample_id[0] == "#":
+    continue
+  
+  data[sample_id] = {"Covered_Snps":covered_snps, "Total_Snps":total_snps}
+
+jsonOut = {"plot_type": "generalstats", "id": "snp_coverage",
+    "pconfig": {
+        "Covered_Snps" : {"title" : "#SNPs Covered"},
+        "Total_Snps" : {"title": "#SNPs Total"}
+    }, 
+    "data" : data
+}
+
+with open(sys.argv[1].rstrip('.txt')+'_mqc.json', 'w') as outfile:
+    json.dump(jsonOut, outfile)
diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py
@@ -34,7 +34,8 @@
     'MTNucRatioCalculator':['v_mtnucratiocalculator.txt',r"Version: (\S+)"],
     'VCF2genome':['v_vcf2genome.txt', r"VCF2Genome \(v. ([0-9].[0-9]+) "],
     'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"],
-    'kraken':['v_kraken.txt', r"Kraken version (\S+)"]
+    'kraken':['v_kraken.txt', r"Kraken version (\S+)"],
+    'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"]
 }
 
 results = OrderedDict()
@@ -69,6 +70,7 @@
 results['malt'] = '<span style="color:#999999;\">N/A</span>'
 results['kraken'] = '<span style="color:#999999;\">N/A</span>'
 results['maltextract'] = '<span style="color:#999999;\">N/A</span>'
+results['eigenstrat_snp_coverage'] = '<span style="color:#999999;\">N/A</span>'
 
 # Search each file using its regex
 for k, v in regexes.items():
diff --git a/docs/output.md b/docs/output.md
@@ -160,6 +160,8 @@ The possible columns displayed by default are as follows:
 - **MT to Nuclear Ratio** This from MTtoNucRatio. This reports the number of reads aligned to a mitochondrial entry in your reference FASTA to all other entries. This will typically be high but will vary depending on tissue type.
 - **XRate** This is from Sex.DetERRmine. This is the relative depth of coverage on the X-chromosome.
 - **YRate** This is from Sex.DetERRmine. This is the relative depth of coverage on the Y-chromosome.
+- **#SNPs Covered** This is from eigenstrat\_snp\_coverage. The number of called SNPs after genotyping with pileupcaller.
+- **#SNPs Total** This is from eigenstrat\_snp\_coverage. The maximum number of covered SNPs, i.e. the number of SNPs in the .snp file provided to pileupcaller with `--pileupcaller_snpfile`.
 - **Number of SNPs** This is from ANGSD. The number of SNPs left after removing sites with no data in a 5 base pair surrounding region.
 - **Contamination Estimate (Method1_ML)** This is from the nuclear contamination function of ANGSD. The Maximum Likelihood contamination estimate according to Method 1. The estimates using Method of Moments and/or those based on Method 2 can be unhidden through the "Configure Columns" button.
 - **Estimate Error (Method1_ML)** This is from ANGSD. The standard error of the Method1 Maximum likelihood estimate. The errors associated with Method of Moments and/or Method2 estimates can be unhidden through the "Configure Columns" button.
@@ -721,7 +723,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir
 - `damageprofiler/` - this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files.
 - `pmdtools/` this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`.
 - `trimmed_bam/` this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read).
-- `genotyping/` this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling.
+- `genotyping/` this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. When pileupcaller is used to create eigenstrat genotypes, this directory also contains eigenstrat SNP coverage statistics.
 - `multivcfanalyzer/` this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files.
 - `sex_determination/` this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the Sample Name, the Nr of Autosomal SNPs, Nr of SNPs on the X/Y chromosome, the Nr of reads mapping to the Autosomes, the Nr of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per bam. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer.
 - `nuclear_contamination/` this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt` which is a summary table of the results for all individual. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimate (with their respective standard errors) for both Method1 and Method2.
diff --git a/environment.yml b/environment.yml
@@ -16,7 +16,7 @@ dependencies:
   - bioconda::bwa=0.7.17
   - bioconda::picard=2.22.9
   - bioconda::samtools=1.9
-  - bioconda::dedup=0.12.6
+  - bioconda::dedup=0.12.7
   - bioconda::angsd=0.933
   - bioconda::circularmapper=1.93.5
   - bioconda::gatk4=4.1.7.0
@@ -33,7 +33,7 @@ dependencies:
   - bioconda::fastp=0.20.1
   - bioconda::bamutil=1.0.14
   - bioconda::mtnucratio=0.7
-  - pysam=0.15.4 #Says python3.7 or less
+  - bioconda::pysam=0.15.4 #Says python3.7 or less
   - bioconda::kraken2=2.0.9beta
   - conda-forge::pandas=1.0.4 #.4 is python3.8+ compatible
   - bioconda::freebayes=1.3.2 #should be fine with python 3.8, but says <3.7 on webpage
@@ -43,4 +43,5 @@ dependencies:
   - conda-forge::biopython=1.76
   - conda-forge::xopen=0.9.0
   - bioconda::bowtie2=2.4.1
+  - bioconda::eigenstratdatabasetools=1.0.2
   #Missing Schmutzi,snpAD
diff --git a/main.nf b/main.nf
@@ -2603,7 +2603,7 @@ if (params.pileupcaller_snpfile.isEmpty ()) {
   path(snp) from ch_snp_for_pileupcaller.collect().dump(tag: "Pileupcaller SNP file")
 
   output:
-  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("pileupcaller.${strandedness}.*")
+  tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("pileupcaller.${strandedness}.*") into ch_for_eigenstrat_snp_coverage
 
   script:
   def use_bed = bed.getName() != 'nf-core_eager_dummy.txt' ? "-l ${bed}" : ''
@@ -2618,7 +2618,33 @@ if (params.pileupcaller_snpfile.isEmpty ()) {
   samtools mpileup -B -q 30 -Q 30 ${use_bed} -f ${fasta} ${bam_list} | pileupCaller ${caller} ${ssmode} ${transitions_mode} --sampleNames ${sample_names} ${use_snp} -e pileupcaller.${strandedness}
   """
  }
-
+ 
+ process eigenstrat_snp_coverage {
+   label 'mc_tiny'
+   tag "${strandedness}"
+   publishDir "${params.outdir}/genotyping", mode: params.publish_dir_mode
+   
+   when:
+   params.run_genotyping && params.genotyping_tool == 'pileupcaller'
+   
+   input:
+   tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump()
+   
+   output:
+   tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc
+   path("*_eigenstrat_coverage.txt")
+   
+   script:
+   // The following code block can be swapped in once the eigenstratdatabasetools MultiQC module becomes available.
+/*   """
+   eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json
+   """*/
+   """
+   eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt
+   parse_snp_cov.py ${strandedness}_eigenstrat_coverage.txt
+   """
+ }
+ 
  process genotyping_angsd {
   label 'mc_small'
   tag "${samplename}"
@@ -3143,6 +3169,7 @@ process get_software_versions {
     endorS.py --version &> v_endorSpy.txt || true
     pileupCaller --version &> v_sequencetools.txt 2>&1 || true
     bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true
+    eigenstrat_snp_coverage --version | cut -d ' ' -f2 >v_eigenstrat_snp_coverage.txt || true
 
     scrape_software_versions.py &> software_versions_mqc.yaml
     """
@@ -3180,6 +3207,7 @@ process multiqc {
     file ('kraken/*') from ch_kraken_for_multiqc.collect().ifEmpty([])
     file ('hops/*') from ch_hops_for_multiqc.collect().ifEmpty([])
     file ('nuclear_contamination/*') from ch_nuclear_contamination_for_multiqc.collect().ifEmpty([])
+    file ('genotyping/*') from ch_eigenstrat_snp_cov_for_multiqc
 
     file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml")