WIP

VerbalCant · VerbalCant · commit 2102bb0908a9 · 2025-05-01T06:48:25.000-04:00
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,3 +1,6 @@
 {
-    "markdown.styles": ["public/vscode_markdown.css"]
+    "markdown.styles": [
+        "public/vscode_markdown.css"
+    ],
+    "nextflow.telemetry.enabled": true
 }
diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md
@@ -1133,3 +1133,23 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --
 ## Expect: BAM input shows up in FastQC -> mapping results.
 nextflow run main.nf -profile test,docker --outdir ./results -w work/ --convert_inputbam --skip_deduplication -resume -ansi-log false -dump-channels
 ```
+
+### MTDNA HAPLOGROUP CLASSIFICATION
+
+```bash
+#### MTDNA HAPLOGROUP CLASSIFICATION with default settings
+## Expect: Directory created 'mtdna_haplogroup/<reference>/<sample_id>' containing a .txt file for each sample with haplogroup assignments
+## Expect: The haplogroup .txt file contains at minimum columns for rank, name, quality, range, and details of the haplogroup assignment
+nextflow run main.nf -profile docker,test --outdir ./results/mtdna_haplogroup_test --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --run_mtdna_haplogroup -resume
+
+#### MTDNA HAPLOGROUP CLASSIFICATION with specific arguments
+## Expect: Directory created 'mtdna_haplogroup/<reference>/<sample_id>' containing a .txt file for each sample with haplogroup assignments
+## Expect: The haplogroup assignment may differ based on the classification settings
+nextflow run main.nf -profile docker,test --outdir ./results/mtdna_haplogroup_test_args --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --run_mtdna_haplogroup --haplogrep_args '--extend-report' -resume
+
+#### MTDNA HAPLOGROUP CLASSIFICATION with custom VCF input
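+## Use as input a version of the TSV that contains mitochondrial VCF files (NOTE(review): the workflow currently takes its VCFs from the genotyping step and aborts without --run_genotyping, so confirm this scenario is runnable as written)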
+## Expect: Directory created 'mtdna_haplogroup/<reference>/<sample_id>' containing a .txt file for each sample with haplogroup assignments
+## Expect: The haplogroup assignment should reflect the variants in the input VCF files
+nextflow run main.nf -profile docker,test --input ~/eager_dsl2_testing/input/mtdna/mtdna_vcf_samples.tsv --outdir ./results/mtdna_haplogroup_vcf_test --run_mtdna_haplogroup -resume
+```
diff --git a/modules.json b/modules.json
@@ -180,6 +180,11 @@
                         "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208",
                         "installed_by": ["modules"]
                     },
+                    "haplogrep3/classify": {
+                        "branch": "master",
+                        "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc",
+                        "installed_by": ["modules"]
+                    },
                     "kraken2/kraken2": {
                         "branch": "master",
                         "git_sha": "653218e79ffa76fde20319e9062f8b8da5cf7555",
diff --git a/nextflow.config b/nextflow.config
@@ -249,6 +249,9 @@ params {
     run_sexdeterrmine                                                = false
     sexdeterrmine_bedfile                                            = null
 
+    // mtDNA haplogroup classification
+    run_mtdna_haplogroup                                             = false
+
     // Genotyping
     run_genotyping                           = false
     genotyping_tool                          = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -1590,6 +1590,21 @@
             },
             "fa_icon": "fas fa-transgender-alt",
             "help_text": ""
+        },
+        "mtdna_haplogroup_options": {
+            "title": "mtDNA Haplogroup Classification",
+            "type": "object",
+            "description": "Options for classifying mitochondrial haplogroups using Haplogrep3.",
+            "default": "",
+            "fa_icon": "fas fa-dna",
+            "properties": {
+                "run_mtdna_haplogroup": {
+                    "type": "boolean",
+                    "description": "Run Haplogrep3 to determine mitochondrial haplogroups from VCF files.",
+                    "fa_icon": "fas fa-dna",
+                    "default": false
+                }
+            }
         }
     },
     "allOf": [
@@ -1646,6 +1661,9 @@
         },
         {
             "$ref": "#/$defs/human_sex_determination"
+        },
+        {
+            "$ref": "#/$defs/mtdna_haplogroup_options"
         }
     ]
 }
diff --git a/subworkflows/local/classify_mtdna_haplogroup.nf b/subworkflows/local/classify_mtdna_haplogroup.nf
@@ -0,0 +1,42 @@
+//
+// Run classify mtdna haplogroup
+//
+
+include { addNewMetaFromAttributes                          } from '../../subworkflows/local/utils_nfcore_eager_pipeline/main'
+
+include { HAPLOGREP3_CLASSIFY as HAPLOGREP3_CLASSIFY_MTDNA  } from '../../modules/nf-core/haplogrep3/classify/main'
+
+workflow CLASSIFY_MTDNA_HAPLOGROUP {
+
+    take:
+    mtdna_haplogroup_vcf // channel: [ val(meta), [ vcf ] ] (any trailing elements, e.g. an index, are ignored)
+
+    main:
+    ch_versions       = Channel.empty()
+    ch_haplogroups    = Channel.empty()
+
+    if ( params.run_mtdna_haplogroup ) {
+        // Prepare input for haplogrep3
+        // The module requires a tuple with exactly [meta, inputfile].
+        // The caller in workflows/eager.nf passes [meta, vcf] and that meta already carries
+        // meta.reference. Do not prepend a reference-only meta here: that would make the
+        // tuple three elements long and push the VCF out of the [meta, inputfile] slots
+        // that the module reads.
+        ch_input_haplogrep3 = mtdna_haplogroup_vcf
+            .map { row ->
+                // Keep the sample meta and the first file only
+                def meta = row[0]
+                def vcf  = row[1]
+                [ meta, vcf ]
+            }
+
+        // Run mtDNA haplogroup classification with haplogrep3
+        HAPLOGREP3_CLASSIFY_MTDNA(ch_input_haplogrep3)
+        ch_haplogroups      = HAPLOGREP3_CLASSIFY_MTDNA.out.txt
+        ch_versions         = ch_versions.mix(HAPLOGREP3_CLASSIFY_MTDNA.out.versions)
+    }
+
+    emit:
+    haplogroups        = ch_haplogroups    // channel: [ val(meta), path("*.txt") ]
+    versions           = ch_versions       // channel: path(versions.yml)
+}
diff --git a/subworkflows/local/haplotype_human_mtdna/main.nf b/subworkflows/local/haplotype_human_mtdna/main.nf
@@ -0,0 +1,38 @@
+// TODO nf-core: If in doubt look at other nf-core/subworkflows to see how we are doing things! :)
+//               https://github.com/nf-core/modules/tree/master/subworkflows
+//               You can also ask for help via your pull request or on the #subworkflows channel on the nf-core Slack workspace:
+//               https://nf-co.re/join
+// TODO nf-core: A subworkflow SHOULD import at least two modules
+
+include { addNewMetaFromAttributes      } from '../../subworkflows/local/utils_nfcore_eager_pipeline/main'
+include { HAPLOGREP3_CLASSIFY           } from '../../modules/nf-core/haplogrep3/classify/main'
+
+// Classify human mtDNA haplogroups from per-sample VCF files with haplogrep3.
+workflow HAPLOTYPE_HUMAN_MTDNA {
+
+    take:
+    ch_mtdna_vcf // channel: [ val(meta), [ vcf ] ]
+
+    main:
+    ch_versions      = Channel.empty()
+    ch_haplogroups   = Channel.empty()
+
+    // Prepare input for haplogrep3
+    // The module requires a tuple with exactly [meta, inputfile].
+    // The incoming meta is passed through unchanged (it is expected to carry meta.reference
+    // already, as in tests/main.nf.test); prepending a second, reference-only meta would
+    // produce a three-element tuple that no longer matches the module input.
+    ch_input_haplogrep3 = ch_mtdna_vcf
+        .map { row ->
+            [ row[0], row[1] ]
+        }
+
+    // Run mtDNA haplogroup classification with haplogrep3
+    HAPLOGREP3_CLASSIFY(ch_input_haplogrep3)
+    ch_haplogroups = HAPLOGREP3_CLASSIFY.out.txt
+    ch_versions    = ch_versions.mix(HAPLOGREP3_CLASSIFY.out.versions)
+
+    emit:
+    haplogroups    = ch_haplogroups    // channel: [ val(meta), path("*.txt") ]
+    versions       = ch_versions       // channel: path(versions.yml)
+}
diff --git a/subworkflows/local/haplotype_human_mtdna/meta.yml b/subworkflows/local/haplotype_human_mtdna/meta.yml
@@ -0,0 +1,35 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "haplotype_human_mtdna"
+description: Classify human mitochondrial DNA haplogroups from VCF files with Haplogrep3
+keywords:
+  - mtdna
+  - mitochondrion
+  - haplogroup
+  - haplogrep3
+  - vcf
+components:
+  - haplogrep3/classify
+input:
+  - ch_mtdna_vcf:
+      type: file
+      description: |
+        The input channel containing mitochondrial VCF files
+        Structure: [ val(meta), path(vcf) ]
+      pattern: "*.{vcf,vcf.gz}"
+output:
+  - haplogroups:
+      type: file
+      description: |
+        Channel containing the Haplogrep3 classification report for each input
+        Structure: [ val(meta), path(txt) ]
+      pattern: "*.txt"
+  - versions:
+      type: file
+      description: |
+        File containing software versions
+        Structure: [ path(versions.yml) ]
+      pattern: "versions.yml"
+authors:
+  - "@trianglegrrl"
+maintainers:
+  - "@trianglegrrl"
diff --git a/subworkflows/local/haplotype_human_mtdna/tests/main.nf.test b/subworkflows/local/haplotype_human_mtdna/tests/main.nf.test
@@ -0,0 +1,36 @@
+// TODO nf-core: Once you have added the required tests, please run the following command to build this file:
+// nf-core subworkflows test haplotype_human_mtdna
+nextflow_workflow {
+
+    name "Test Subworkflow HAPLOTYPE_HUMAN_MTDNA"
+    script "../main.nf" // subworkflow script under test, relative to this tests/ directory
+    workflow "HAPLOTYPE_HUMAN_MTDNA" // workflow inside that script which the test executes
+
+    tag "subworkflows"
+    tag "subworkflows_nfcore" // NOTE(review): this subworkflow lives under subworkflows/local - confirm this tag is intended
+    tag "subworkflows/haplotype_human_mtdna"
+    tag "haplogrep3"
+    tag "haplogrep3/classify"
+
+    test("homo_sapiens - vcf - mitochondrial") {
+
+        when {
+            workflow {
+                """
+                input[0] = [
+                    [ id:'test', single_end:false, reference:'hg19' ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/vcf/NA12878_chrM.vcf.gz', checkIfExists: true)
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success }, // the subworkflow must finish without errors
+                { assert snapshot(workflow.out.haplogroups).match() }, // emitted haplogroup channel must match the stored snapshot
+                { assert snapshot(workflow.out.versions).match() } // emitted versions channel must match the stored snapshot
+            )
+        }
+    }
+}
diff --git a/workflows/eager.nf b/workflows/eager.nf
@@ -32,6 +32,7 @@ include { METAGENOMICS                                        } from '../subwork
 include { ESTIMATE_CONTAMINATION                              } from '../subworkflows/local/estimate_contamination'
 include { CALCULATE_DAMAGE                                    } from '../subworkflows/local/calculate_damage'
 include { RUN_SEXDETERRMINE                                   } from '../subworkflows/local/run_sex_determination'
+include { CLASSIFY_MTDNA_HAPLOGROUP                           } from '../subworkflows/local/classify_mtdna_haplogroup'
 include { MERGE_LIBRARIES                                     } from '../subworkflows/local/merge_libraries'
 include { MERGE_LIBRARIES as MERGE_LIBRARIES_GENOTYPING       } from '../subworkflows/local/merge_libraries'
 include { GENOTYPE                                            } from '../subworkflows/local/genotype'
@@ -560,6 +561,43 @@ workflow EAGER {
         ch_multiqc_files = ch_multiqc_files.mix(GENOTYPE.out.mqc.collect { it[1] }.ifEmpty([]))
     }
 
+    //
+    // SUBWORKFLOW: Run mtDNA Haplogroup Classification
+    //
+
+    if (params.run_mtdna_haplogroup) {
+        // Ensure genotyping has run, as we need its VCF output
+        if (!params.run_genotyping) {
+            error "Cannot run mtDNA haplogroup classification (--run_mtdna_haplogroup) without running genotyping (--run_genotyping). VCF files are required as input."
+        }
+
+        // Key the mitochondrial header by the reference ID so it can be attached to every VCF of that reference
+        ch_mito_header_for_filter = REFERENCE_INDEXING.out.mitochondrion_header
+            .map { meta, header -> [ meta.id, header ] }
+
+        // Attach the matching mitochondrial header to each genotyping VCF, then reduce back to [ meta, vcf ]
+        ch_mtdna_haplogroup_input = GENOTYPE.out.vcf
+            .map { meta, vcf, tbi ->
+                // Need meta.reference to filter based on mito header
+                def reference_id = meta.reference
+                [ reference_id, meta, vcf ]
+            }
+            .combine(ch_mito_header_for_filter, by: 0) // combine, not join: several VCFs share one reference ID and join would pair each key only once
+            .filter { ref_id, meta, vcf, mito_header ->
+                // Placeholder: only checks that the VCF name includes the sample ID.
+                // mito_header is not consulted yet, so nothing is excluded on mitochondrial grounds here.
+                vcf.name.contains(meta.id)
+                // TODO: Refine filter logic based on actual mito_header content and VCF naming conventions
+            }
+            .map { ref_id, meta, vcf, mito_header ->
+                // Reformat to the expected [meta, vcf] structure for the subworkflow
+                [ meta, vcf ]
+            }
+
+        CLASSIFY_MTDNA_HAPLOGROUP(ch_mtdna_haplogroup_input)
+        ch_versions = ch_versions.mix(CLASSIFY_MTDNA_HAPLOGROUP.out.versions)
+    }
+
     //
     // Collate and save software versions
     //

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,6 @@`
`1`	`1`	`{`
`2`		`- "markdown.styles": ["public/vscode_markdown.css"]`
	`2`	`+ "markdown.styles": [`
	`3`	`+ "public/vscode_markdown.css"`
	`4`	`+ ],`
	`5`	`+ "nextflow.telemetry.enabled": true`
`3`	`6`	`}`