Improve defaults for annotation workflows (#1155)

dorien-er · web-flow · commit db4e12e1c5ef · 2026-04-30T08:57:53.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -38,6 +38,12 @@
 
 * Bump viash to 0.9.7 (PR #1145)
 
+* `annotate/celltypist` and `workflows/annotation/celltypist`: set `--input_layer` default to `log_normalized` and `--reference_var_input` default to `filter_with_hvg` to align with upstream workflow defaults (PR #1155).
+
+* `annotate/singler`: set `--input_layer` default to `log_normalized` and `--reference_var_input` default to `filter_with_hvg` to align with upstream workflow defaults (PR #1155).
+
+* `workflows/annotation/scanvi_scarches`: set `--input_obs_batch_label` and `--reference_obs_batch_label` defaults to `sample_id` and `--reference_var_hvg` default to `filter_with_hvg` to align with upstream workflow defaults (PR #1155).
+
 * `cluster/leiden`: added `flavor`, `n_iterations` and `seed` arguments (PR #1132)
 
 * `cluster/leiden`: avoid creating unnecessary copies of the output data (PR #1132).
diff --git a/resources_test_scripts/annotation_test_data.sh b/resources_test_scripts/annotation_test_data.sh
@@ -65,6 +65,10 @@ disease = np.random.choice(["healthy", "diseased"], size=n_cells, p=[0.5, 0.5])
 sub_ref_adata_final.obs["treatment"] = treatment
 sub_ref_adata_final.obs["disease"] = disease
 
+# Strip raw slot - not needed for annotation and causes compatibility issues between AnnData/MuData versions
+sub_ref_adata_final = sub_ref_adata_final.copy()
+sub_ref_adata_final.raw = None
+
 # Write out data
 sub_ref_adata_final.write("${OUT}/TS_Blood_filtered.h5ad", compression='gzip')
 HEREDOC
diff --git a/src/annotate/celltypist/config.vsh.yaml b/src/annotate/celltypist/config.vsh.yaml
@@ -26,6 +26,7 @@ argument_groups:
         required: false
       - name: "--input_layer"
         type: string
+        default: log_normalized
         description: The layer in the input data containing counts that are lognormalized to 10000, .X is not to be used. 
       - name: "--input_var_gene_names"
         type: string
@@ -62,9 +63,10 @@ argument_groups:
         default: "cell_ontology_class"
       - name: "--reference_var_input"
         type: string
+        default: "filter_with_hvg"
         required: false
         description: |
-          .var column containing highly variable genes. By default, do not subset genes.
+          .var column containing highly variable genes. Defaults to `filter_with_hvg`, so genes are subset even when this argument is omitted; genes are not subset only if no column is set.
       - name: "--reference_var_gene_names"
         type: string
         required: false
@@ -147,13 +149,14 @@ engines:
   - type: docker
     image: nvcr.io/nvidia/pytorch:25.11-py3
     setup:
-      - type: python
-        __merge__: [ /src/base/requirements/scanpy.yaml, .]
       - type: python
         packages:
           - celltypist==1.7.1
       - type: python
         __merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
+    test_setup:
+      - type: python
+        __merge__: [ /src/base/requirements/scanpy.yaml, .]
     __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
 runners:
   - type: executable
diff --git a/src/annotate/celltypist/test.py b/src/annotate/celltypist/test.py
@@ -21,34 +21,47 @@
 model_file = (
     f"{meta['resources_dir']}/annotation_test_data/celltypist_model_Immune_All_Low.pkl"
 )
-celltypist_input_file = (
-    f"{meta['resources_dir']}/annotation_test_data/demo_2000_cells.h5mu"
-)
-# input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
+input_file_1 = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
+input_file_2 = f"{meta['resources_dir']}/annotation_test_data/demo_2000_cells.h5mu"
+reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
 
 
 def log_normalize(adata):
-    sc.pp.normalize_total(adata, target_sum=1e4)
-    sc.pp.log1p(adata)
+    adata_norm = sc.pp.normalize_total(adata, target_sum=1e4, copy=True)
+    adata_lognorm = sc.pp.log1p(adata_norm, copy=True)
+    adata.layers["log_normalized"] = adata_lognorm.X
+    return adata
+
+
+def calculate_hvg(adata, n_top_genes=1000):  # flags the top `n_top_genes` genes in adata.var["filter_with_hvg"] and returns adata
+    hvg_stats = sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, inplace=False)  # no `copy` kwarg exists; inplace=False returns the per-gene table
+    adata.var["filter_with_hvg"] = hvg_stats["highly_variable"].to_numpy()  # positional assignment, independent of the returned index
     return adata
 
 
 @pytest.fixture
 def reference_mdata():
-    mdata = mu.read_h5mu(
-        f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
-    )
+    mdata = mu.read_h5mu(reference_file)
+    adata = mdata.mod["rna"]  # already has layer "log_normalized" with 10k target sum
+    adata.var["filter_with_hvg"] = adata.var[
+        "highly_variable"
+    ]  # already has highly variable genes calculated
+    return mdata
+
+
+@pytest.fixture
+def input_mdata():
+    mdata = mu.read_h5mu(input_file_1)
     adata = mdata.mod["rna"].copy()
+    adata.layers["counts"] = adata.X.copy()  # store raw counts in a layer
     adata_lognorm = log_normalize(adata)
     mdata.mod["rna"] = adata_lognorm
     return mdata
 
 
 @pytest.fixture
-def input_mdata():
-    mdata = mu.read_h5mu(
-        f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
-    )
+def model_input_mdata():
+    mdata = mu.read_h5mu(input_file_2)
     adata = mdata.mod["rna"].copy()
     adata_lognorm = log_normalize(adata)
     mdata.mod["rna"] = adata_lognorm
@@ -155,15 +168,20 @@ def test_set_params(
     )
 
 
-def test_with_model(run_component, random_h5mu_path):
+def test_with_model(
+    run_component, random_h5mu_path, write_mudata_to_file, model_input_mdata
+):
     output_file = random_h5mu_path()
+    input_file = write_mudata_to_file(model_input_mdata)
 
     run_component(
         [
             "--input",
-            celltypist_input_file,
+            input_file,
             "--model",
             model_file,
+            "--reference_layer",
+            "",
             "--reference_obs_targets",
             "cell_type",
             "--output",
@@ -208,7 +226,7 @@ def test_fail_invalid_input_expression(
                 "--input",
                 input_file,
                 "--input_layer",
-                "log_normalized",
+                "counts",
                 "--reference",
                 reference_file,
                 "--reference_layer",
diff --git a/src/annotate/singler/config.vsh.yaml b/src/annotate/singler/config.vsh.yaml
@@ -30,6 +30,7 @@ argument_groups:
         required: false
       - name: "--input_layer"
         type: string
+        default: "log_normalized"
         description: The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used. 
       - name: "--input_var_gene_names"
         type: string
@@ -74,9 +75,10 @@ argument_groups:
           The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
       - name: "--reference_var_input"
         type: string
+        default: "filter_with_hvg"
         required: false
         description: |
-          .var column containing a boolean mask corresponding to genes to be used for marker selection. By default, do not subset genes.
+          .var column containing a boolean mask corresponding to genes to be used for marker selection. Defaults to `filter_with_hvg`, so genes are subset even when this argument is omitted; genes are not subset only if no column is set.
 
   - name: Arguments
     description: Arguments related to the training of and classification with the SingleR model
@@ -185,6 +187,9 @@ engines:
       - type: python
         user: true
         __merge__: [ /src/base/requirements/anndata_mudata.yaml ]
+    test_setup:
+      - type: python
+        __merge__: [ /src/base/requirements/scanpy.yaml, .]
     __merge__: [ /src/base/requirements/python_test_setup.yaml, .]
 
 runners:
diff --git a/src/annotate/singler/test.py b/src/annotate/singler/test.py
@@ -1,5 +1,6 @@
 import sys
 import os
+import scanpy as sc
 import pytest
 import mudata as mu
 from openpipeline_testutils.asserters import assert_annotation_objects_equal
@@ -14,17 +15,36 @@
 reference_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu"
 
 
-def test_simple_execution(run_component, random_h5mu_path):
+def log_normalize(adata):
+    adata_lognorm = adata.copy()
+    sc.pp.normalize_total(adata_lognorm, target_sum=1e4)
+    sc.pp.log1p(adata_lognorm)
+    adata.layers["log_normalized"] = adata_lognorm.X
+    return adata
+
+
+@pytest.fixture
+def input_path(write_mudata_to_file):
+    mdata = mu.read_h5mu(input_file)
+    adata = mdata.mod["rna"].copy()
+    adata_lognorm = log_normalize(adata)
+    mdata.mod["rna"] = adata_lognorm
+    return write_mudata_to_file(mdata)
+
+
+def test_simple_execution(run_component, random_h5mu_path, input_path):
     output_file = random_h5mu_path()
 
     run_component(
         [
             "--input",
-            input_file,
+            input_path,
             "--input_var_gene_names",
             "gene_symbol",
             "--reference",
             reference_file,
+            "--reference_var_input",
+            "highly_variable",
             "--reference_obs_target",
             "cell_ontology_class",
             "--output",
@@ -72,13 +92,13 @@ def test_simple_execution(run_component, random_h5mu_path):
     )
 
 
-def test_params(run_component, random_h5mu_path):
+def test_params(run_component, random_h5mu_path, input_path):
     output_file = random_h5mu_path()
 
     run_component(
         [
             "--input",
-            input_file,
+            input_path,
             "--reference",
             reference_file,
             "--reference_obs_target",
diff --git a/src/workflows/annotation/celltypist/config.vsh.yaml b/src/workflows/annotation/celltypist/config.vsh.yaml
@@ -72,9 +72,10 @@ argument_groups:
           The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
       - name: "--reference_var_input"
         type: string
+        default: "filter_with_hvg"
         required: false
         description: |
-          .var column containing highly variable genes. By default, do not subset genes.
+          .var column containing highly variable genes. Defaults to `filter_with_hvg`, so genes are subset even when this argument is omitted; genes are not subset only if no column is set.
 
   - name: Model arguments
     description: Model arguments.
diff --git a/src/workflows/annotation/celltypist/test.nf b/src/workflows/annotation/celltypist/test.nf
@@ -15,6 +15,7 @@ workflow test_wf {
         input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
         reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"),
         reference_var_gene_names: "ensemblid",
+        reference_var_input: "highly_variable",
         input_obs_batch_label: "sample_id",
         reference_obs_batch_label: "donor_assay",
         reference_obs_target: "cell_type",
diff --git a/src/workflows/annotation/scanvi_scarches/config.vsh.yaml b/src/workflows/annotation/scanvi_scarches/config.vsh.yaml
@@ -39,6 +39,7 @@ argument_groups:
         type: string
         description: "The .obs field in the input (query) dataset containing the batch labels."
         example: "sample"
+        default: "sample_id"
         required: true
       - name: "--input_obs_size_factor"
         type: string
@@ -90,6 +91,7 @@ argument_groups:
         description: The `.obs` key containing the target labels.
       - name: "--reference_obs_batch_label"
         type: string
+        default: "sample_id"
         description:  "The .obs field in the reference dataset containing the batch labels."
         example: "sample"
         required: true
@@ -127,6 +129,7 @@ argument_groups:
           Value in the --reference_obs_batch_label field that indicates unlabeled observations
       - name: "--reference_var_hvg"
         type: string
+        default: "filter_with_hvg"
         required: false
         description: ".var column containing highly variable genes. If not provided, genes will not be subset."
       - name: "--reference_var_gene_names"