Skip to content

Commit db4e12e

Browse files
authored
Improve defaults for annotation workflows (#1155)
1 parent 5609d44 commit db4e12e

9 files changed

Lines changed: 86 additions & 25 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@
3838

3939
* Bump viash to 0.9.7 (PR #1145)
4040

41+
* `annotate/celltypist` and `workflows/annotation/celltypist`: set `--input_layer` default to `log_normalized` and `--reference_var_input` default to `filter_with_hvg` to align with upstream workflow defaults (PR #1155).
42+
43+
* `annotate/singler`: set `--input_layer` default to `log_normalized` and `--reference_var_input` default to `filter_with_hvg` to align with upstream workflow defaults (PR #1155).
44+
45+
* `workflows/annotation/scanvi_scarches`: set `--input_obs_batch_label` and `--reference_obs_batch_label` defaults to `sample_id` and `--reference_var_hvg` default to `filter_with_hvg` to align with upstream workflow defaults (PR #1155).
46+
4147
* `cluster/leiden`: added `flavor`, `n_iterations` and `seed` arguments (PR #1132)
4248

4349
* `cluster/leiden`: avoid creating unnecessary copies of the output data (PR #1132).

resources_test_scripts/annotation_test_data.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ disease = np.random.choice(["healthy", "diseased"], size=n_cells, p=[0.5, 0.5])
6565
sub_ref_adata_final.obs["treatment"] = treatment
6666
sub_ref_adata_final.obs["disease"] = disease
6767
68+
# Strip raw slot - not needed for annotation and causes compatibility issues between AnnData/MuData versions
69+
sub_ref_adata_final = sub_ref_adata_final.copy()
70+
sub_ref_adata_final.raw = None
71+
6872
# Write out data
6973
sub_ref_adata_final.write("${OUT}/TS_Blood_filtered.h5ad", compression='gzip')
7074
HEREDOC

src/annotate/celltypist/config.vsh.yaml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ argument_groups:
2626
required: false
2727
- name: "--input_layer"
2828
type: string
29+
default: log_normalized
2930
description: The layer in the input data containing counts that are log-normalized to a target sum of 10000; .X is not to be used.
3031
- name: "--input_var_gene_names"
3132
type: string
@@ -62,9 +63,10 @@ argument_groups:
6263
default: "cell_ontology_class"
6364
- name: "--reference_var_input"
6465
type: string
66+
default: "filter_with_hvg"
6567
required: false
6668
description: |
67-
.var column containing highly variable genes. By default, do not subset genes.
69+
.var column containing highly variable genes. If not provided, genes will not be subset.
6870
- name: "--reference_var_gene_names"
6971
type: string
7072
required: false
@@ -147,13 +149,14 @@ engines:
147149
- type: docker
148150
image: nvcr.io/nvidia/pytorch:25.11-py3
149151
setup:
150-
- type: python
151-
__merge__: [ /src/base/requirements/scanpy.yaml, .]
152152
- type: python
153153
packages:
154154
- celltypist==1.7.1
155155
- type: python
156156
__merge__: [ /src/base/requirements/anndata_mudata.yaml, .]
157+
test_setup:
158+
- type: python
159+
__merge__: [ /src/base/requirements/scanpy.yaml, .]
157160
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
158161
runners:
159162
- type: executable

src/annotate/celltypist/test.py

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,34 +21,47 @@
2121
model_file = (
2222
f"{meta['resources_dir']}/annotation_test_data/celltypist_model_Immune_All_Low.pkl"
2323
)
24-
celltypist_input_file = (
25-
f"{meta['resources_dir']}/annotation_test_data/demo_2000_cells.h5mu"
26-
)
27-
# input_file = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
24+
input_file_1 = f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
25+
input_file_2 = f"{meta['resources_dir']}/annotation_test_data/demo_2000_cells.h5mu"
26+
reference_file = f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
2827

2928

3029
def log_normalize(adata):
31-
sc.pp.normalize_total(adata, target_sum=1e4)
32-
sc.pp.log1p(adata)
30+
adata_norm = sc.pp.normalize_total(adata, target_sum=1e4, copy=True)
31+
adata_lognorm = sc.pp.log1p(adata_norm, copy=True)
32+
adata.layers["log_normalized"] = adata_lognorm.X
33+
return adata
34+
35+
36+
def calculate_hvg(adata, n_top_genes=1000):
37+
adata_hvg = sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, copy=True)
38+
adata.var["filter_with_hvg"] = adata_hvg.var["highly_variable"]
3339
return adata
3440

3541

3642
@pytest.fixture
3743
def reference_mdata():
38-
mdata = mu.read_h5mu(
39-
f"{meta['resources_dir']}/annotation_test_data/TS_Blood_filtered.h5mu"
40-
)
44+
mdata = mu.read_h5mu(reference_file)
45+
adata = mdata.mod["rna"] # already has layer "log_normalized" with 10k target sum
46+
adata.var["filter_with_hvg"] = adata.var[
47+
"highly_variable"
48+
] # already has highly variable genes calculated
49+
return mdata
50+
51+
52+
@pytest.fixture
53+
def input_mdata():
54+
mdata = mu.read_h5mu(input_file_1)
4155
adata = mdata.mod["rna"].copy()
56+
adata.layers["counts"] = adata.X.copy() # store raw counts in a layer
4257
adata_lognorm = log_normalize(adata)
4358
mdata.mod["rna"] = adata_lognorm
4459
return mdata
4560

4661

4762
@pytest.fixture
48-
def input_mdata():
49-
mdata = mu.read_h5mu(
50-
f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"
51-
)
63+
def model_input_mdata():
64+
mdata = mu.read_h5mu(input_file_2)
5265
adata = mdata.mod["rna"].copy()
5366
adata_lognorm = log_normalize(adata)
5467
mdata.mod["rna"] = adata_lognorm
@@ -155,15 +168,20 @@ def test_set_params(
155168
)
156169

157170

158-
def test_with_model(run_component, random_h5mu_path):
171+
def test_with_model(
172+
run_component, random_h5mu_path, write_mudata_to_file, model_input_mdata
173+
):
159174
output_file = random_h5mu_path()
175+
input_file = write_mudata_to_file(model_input_mdata)
160176

161177
run_component(
162178
[
163179
"--input",
164-
celltypist_input_file,
180+
input_file,
165181
"--model",
166182
model_file,
183+
"--reference_layer",
184+
"",
167185
"--reference_obs_targets",
168186
"cell_type",
169187
"--output",
@@ -208,7 +226,7 @@ def test_fail_invalid_input_expression(
208226
"--input",
209227
input_file,
210228
"--input_layer",
211-
"log_normalized",
229+
"counts",
212230
"--reference",
213231
reference_file,
214232
"--reference_layer",

src/annotate/singler/config.vsh.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ argument_groups:
3030
required: false
3131
- name: "--input_layer"
3232
type: string
33+
default: "log_normalized"
3334
description: The layer in the input data containing log normalized counts to be used for cell type annotation if .X is not to be used.
3435
- name: "--input_var_gene_names"
3536
type: string
@@ -74,9 +75,10 @@ argument_groups:
7475
The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
7576
- name: "--reference_var_input"
7677
type: string
78+
default: "filter_with_hvg"
7779
required: false
7880
description: |
79-
.var column containing a boolean mask corresponding to genes to be used for marker selection. By default, do not subset genes.
81+
.var column containing a boolean mask corresponding to genes to be used for marker selection. If not provided, genes will not be subset.
8082
8183
- name: Arguments
8284
description: Arguments related to the training of and classification with the SingleR model
@@ -185,6 +187,9 @@ engines:
185187
- type: python
186188
user: true
187189
__merge__: [ /src/base/requirements/anndata_mudata.yaml ]
190+
test_setup:
191+
- type: python
192+
__merge__: [ /src/base/requirements/scanpy.yaml, .]
188193
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
189194

190195
runners:

src/annotate/singler/test.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import sys
22
import os
3+
import scanpy as sc
34
import pytest
45
import mudata as mu
56
from openpipeline_testutils.asserters import assert_annotation_objects_equal
@@ -14,17 +15,36 @@
1415
reference_file = f"{meta['resources_dir']}/TS_Blood_filtered.h5mu"
1516

1617

17-
def test_simple_execution(run_component, random_h5mu_path):
18+
def log_normalize(adata):
19+
adata_lognorm = adata.copy()
20+
sc.pp.normalize_total(adata_lognorm, target_sum=1e4)
21+
sc.pp.log1p(adata_lognorm)
22+
adata.layers["log_normalized"] = adata_lognorm.X
23+
return adata
24+
25+
26+
@pytest.fixture
27+
def input_path(write_mudata_to_file):
28+
mdata = mu.read_h5mu(input_file)
29+
adata = mdata.mod["rna"].copy()
30+
adata_lognorm = log_normalize(adata)
31+
mdata.mod["rna"] = adata_lognorm
32+
return write_mudata_to_file(mdata)
33+
34+
35+
def test_simple_execution(run_component, random_h5mu_path, input_path):
1836
output_file = random_h5mu_path()
1937

2038
run_component(
2139
[
2240
"--input",
23-
input_file,
41+
input_path,
2442
"--input_var_gene_names",
2543
"gene_symbol",
2644
"--reference",
2745
reference_file,
46+
"--reference_var_input",
47+
"highly_variable",
2848
"--reference_obs_target",
2949
"cell_ontology_class",
3050
"--output",
@@ -72,13 +92,13 @@ def test_simple_execution(run_component, random_h5mu_path):
7292
)
7393

7494

75-
def test_params(run_component, random_h5mu_path):
95+
def test_params(run_component, random_h5mu_path, input_path):
7696
output_file = random_h5mu_path()
7797

7898
run_component(
7999
[
80100
"--input",
81-
input_file,
101+
input_path,
82102
"--reference",
83103
reference_file,
84104
"--reference_obs_target",

src/workflows/annotation/celltypist/config.vsh.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,10 @@ argument_groups:
7272
The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
7373
- name: "--reference_var_input"
7474
type: string
75+
default: "filter_with_hvg"
7576
required: false
7677
description: |
77-
.var column containing highly variable genes. By default, do not subset genes.
78+
.var column containing highly variable genes. If not provided, genes will not be subset.
7879
7980
- name: Model arguments
8081
description: Model arguments.

src/workflows/annotation/celltypist/test.nf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ workflow test_wf {
1515
input: resources_test.resolve("pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu"),
1616
reference: resources_test.resolve("annotation_test_data/TS_Blood_filtered.h5mu"),
1717
reference_var_gene_names: "ensemblid",
18+
reference_var_input: "highly_variable",
1819
input_obs_batch_label: "sample_id",
1920
reference_obs_batch_label: "donor_assay",
2021
reference_obs_target: "cell_type",

src/workflows/annotation/scanvi_scarches/config.vsh.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ argument_groups:
3939
type: string
4040
description: "The .obs field in the input (query) dataset containing the batch labels."
4141
example: "sample"
42+
default: "sample_id"
4243
required: true
4344
- name: "--input_obs_size_factor"
4445
type: string
@@ -90,6 +91,7 @@ argument_groups:
9091
description: The `.obs` key containing the target labels.
9192
- name: "--reference_obs_batch_label"
9293
type: string
94+
default: "sample_id"
9395
description: "The .obs field in the reference dataset containing the batch labels."
9496
example: "sample"
9597
required: true
@@ -127,6 +129,7 @@ argument_groups:
127129
Value in the --reference_obs_batch_label field that indicates unlabeled observations
128130
- name: "--reference_var_hvg"
129131
type: string
132+
default: "filter_with_hvg"
130133
required: false
131134
description: ".var column containing highly variable genes. If not provided, genes will not be subset."
132135
- name: "--reference_var_gene_names"

0 commit comments

Comments
 (0)