
Commit dbe70d6

Commit message: fix
1 parent 84522ad commit dbe70d6

File tree: 3 files changed (+139, -2 lines)

app/desktop/studio_server/eval_api.py

Lines changed: 9 additions & 0 deletions
@@ -25,6 +25,7 @@
 from kiln_ai.datamodel.json_schema import string_to_json_key
 from kiln_ai.datamodel.prompt_id import is_frozen_prompt
 from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties
+from kiln_ai.datamodel.spec import SpecStatus
 from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig
 from kiln_ai.datamodel.task_output import normalize_rating
 from kiln_ai.utils.name_generator import generate_memorable_name
@@ -953,11 +954,15 @@ async def get_run_config_eval_scores(
     task_run_config_from_id(project_id, task_id, run_config_id)

     # Build a mapping from eval_id to spec_id for evals that are associated with specs
+    # Also track which eval_ids belong to archived specs so we can exclude them
     specs = task.specs()
     eval_id_to_spec_id: Dict[str, str] = {}
+    archived_eval_ids: set[str] = set()
     for spec in specs:
         if spec.eval_id and spec.id:
             eval_id_to_spec_id[spec.eval_id] = spec.id
+            if spec.status == SpecStatus.archived:
+                archived_eval_ids.add(spec.eval_id)

     evals = task.evals()
     eval_results: List[RunConfigEvalResult] = []
@@ -974,6 +979,10 @@
     total_eval_runs = 0

     for eval in evals:
+        # Skip evals associated with archived specs
+        if eval.id and eval.id in archived_eval_ids:
+            continue
+
         # Get the dataset size for this eval
         expected_dataset_ids = dataset_ids_in_filter(
             task, eval.eval_set_filter_id, readonly=True
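
The change above is easiest to read in isolation: build the eval-to-spec map as before, additionally collect the eval IDs of archived specs in a set, and skip those evals in the scoring loop. Below is a minimal, runnable sketch of that filtering step. The Spec/SpecStatus/Eval dataclasses and the evals_to_score helper are illustrative stand-ins (the real types live in kiln_ai.datamodel and the real logic runs inside get_run_config_eval_scores); only the field names and the set-based exclusion mirror the diff.

# Sketch of the archived-spec filtering added above (stand-in types, not the real models).
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional


class SpecStatus(str, Enum):
    active = "active"
    archived = "archived"


@dataclass
class Spec:
    id: Optional[str]
    eval_id: Optional[str]
    status: SpecStatus


@dataclass
class Eval:
    id: Optional[str]
    name: str


def evals_to_score(specs: List[Spec], evals: List[Eval]) -> List[Eval]:
    # Map eval_id -> spec_id, and remember which eval_ids belong to archived specs.
    eval_id_to_spec_id: Dict[str, str] = {}
    archived_eval_ids: set[str] = set()
    for spec in specs:
        if spec.eval_id and spec.id:
            eval_id_to_spec_id[spec.eval_id] = spec.id
            if spec.status == SpecStatus.archived:
                archived_eval_ids.add(spec.eval_id)

    # Skip evals whose spec has been archived; everything else is scored.
    return [e for e in evals if not (e.id and e.id in archived_eval_ids)]


if __name__ == "__main__":
    specs = [
        Spec(id="active_spec1", eval_id="eval1", status=SpecStatus.active),
        Spec(id="archived_spec1", eval_id="archived_eval1", status=SpecStatus.archived),
    ]
    evals = [Eval(id="eval1", name="Test Eval"), Eval(id="archived_eval1", name="Archived Eval")]
    print([e.name for e in evals_to_score(specs, evals)])  # ['Test Eval']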

app/desktop/studio_server/test_eval_api.py

Lines changed: 127 additions & 1 deletion
@@ -47,7 +47,7 @@
 )
 from kiln_ai.datamodel.prompt import BasePrompt
 from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties
-from kiln_ai.datamodel.spec import Spec
+from kiln_ai.datamodel.spec import Spec, SpecStatus
 from kiln_ai.datamodel.spec_properties import DesiredBehaviourProperties, SpecType
 from kiln_ai.datamodel.task import TaskRunConfig
 from kiln_ai.datamodel.task_run import Usage
@@ -2362,6 +2362,132 @@ async def test_get_run_config_eval_scores_includes_spec_id(
     assert legacy_eval_result["spec_id"] is None


+@pytest.mark.asyncio
+async def test_get_run_config_eval_scores_excludes_archived_specs(
+    client, mock_task, mock_eval, mock_eval_config, mock_run_config
+):
+    """Test that get_run_config_eval_scores excludes evals associated with archived specs"""
+
+    # Create an active spec
+    active_spec = Spec(
+        id="active_spec1",
+        name="Active Spec",
+        definition="Active spec definition",
+        properties=DesiredBehaviourProperties(
+            spec_type=SpecType.desired_behaviour,
+            core_requirement="test instruction",
+            desired_behaviour_description="test desired behaviour",
+        ),
+        eval_id=mock_eval.id,
+        status=SpecStatus.active,
+        parent=mock_task,
+    )
+    active_spec.save_to_file()
+
+    # Create an archived spec with its own eval
+    archived_eval = Eval(
+        id="archived_eval1",
+        name="Archived Eval",
+        description="Eval for archived spec",
+        template=None,
+        eval_set_filter_id="tag::archived_eval_set",
+        eval_configs_filter_id="tag::archived_golden",
+        output_scores=[
+            EvalOutputScore(
+                name="score1",
+                instruction="desc1",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+        parent=mock_task,
+    )
+    archived_eval.save_to_file()
+
+    archived_eval_config = EvalConfig(
+        id="archived_eval_config1",
+        name="Archived Eval Config",
+        config_type=EvalConfigType.g_eval,
+        properties={"eval_steps": ["step1"]},
+        parent=archived_eval,
+        model_name="gpt-4",
+        model_provider="openai",
+    )
+    archived_eval_config.save_to_file()
+    archived_eval.current_config_id = archived_eval_config.id
+    archived_eval.save_to_file()
+
+    archived_spec = Spec(
+        id="archived_spec1",
+        name="Archived Spec",
+        definition="Archived spec definition",
+        properties=DesiredBehaviourProperties(
+            spec_type=SpecType.desired_behaviour,
+            core_requirement="test instruction",
+            desired_behaviour_description="test desired behaviour",
+        ),
+        eval_id=archived_eval.id,
+        status=SpecStatus.archived,
+        parent=mock_task,
+    )
+    archived_spec.save_to_file()
+
+    # Build mock eval objects with explicit attributes
+    mock_eval_config_for_api = MagicMock()
+    mock_eval_config_for_api.id = mock_eval_config.id
+    mock_eval_config_for_api.runs.return_value = []
+
+    mock_eval_for_api = MagicMock()
+    mock_eval_for_api.id = mock_eval.id
+    mock_eval_for_api.name = mock_eval.name
+    mock_eval_for_api.eval_set_filter_id = mock_eval.eval_set_filter_id
+    mock_eval_for_api.output_scores = mock_eval.output_scores
+    mock_eval_for_api.current_config_id = mock_eval_config.id
+    mock_eval_for_api.configs.return_value = [mock_eval_config_for_api]
+
+    archived_eval_config_for_api = MagicMock()
+    archived_eval_config_for_api.id = archived_eval_config.id
+    archived_eval_config_for_api.runs.return_value = []
+
+    archived_eval_for_api = MagicMock()
+    archived_eval_for_api.id = archived_eval.id
+    archived_eval_for_api.name = archived_eval.name
+    archived_eval_for_api.eval_set_filter_id = archived_eval.eval_set_filter_id
+    archived_eval_for_api.output_scores = archived_eval.output_scores
+    archived_eval_for_api.current_config_id = archived_eval_config.id
+    archived_eval_for_api.configs.return_value = [archived_eval_config_for_api]
+
+    mock_task_for_api = MagicMock()
+    mock_task_for_api.evals.return_value = [mock_eval_for_api, archived_eval_for_api]
+    mock_task_for_api.specs.return_value = [active_spec, archived_spec]
+
+    with (
+        patch(
+            "app.desktop.studio_server.eval_api.task_from_id"
+        ) as mock_task_from_id_patch,
+        patch(
+            "app.desktop.studio_server.eval_api.task_run_config_from_id"
+        ) as mock_task_run_config_from_id_patch,
+        patch(
+            "app.desktop.studio_server.eval_api.dataset_ids_in_filter"
+        ) as mock_dataset_ids_in_filter,
+    ):
+        mock_task_from_id_patch.return_value = mock_task_for_api
+        mock_task_run_config_from_id_patch.return_value = mock_run_config
+        mock_dataset_ids_in_filter.return_value = set()
+
+        response = client.get(
+            f"/api/projects/project1/tasks/task1/run_config/{mock_run_config.id}/eval_scores"
+        )
+
+        assert response.status_code == 200
+        data = response.json()
+
+        # Only the active spec's eval should be present, not the archived one
+        assert len(data["eval_results"]) == 1
+        assert data["eval_results"][0]["eval_name"] == "Test Eval"
+        assert data["eval_results"][0]["spec_id"] == "active_spec1"
+
+
 @pytest.mark.asyncio
 async def test_get_run_configs_includes_finetunes_with_run_config(
     client, mock_task_from_id, mock_task
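
For reference, this is roughly the slice of the eval_scores response the new test asserts against. Only the fields the assertions touch (eval_results, eval_name, spec_id) are shown; the dict below is illustrative, not the endpoint's full schema.

# Illustrative subset of the response checked by the test above; field names
# come from the assertions, other per-eval fields are omitted.
expected_subset = {
    "eval_results": [
        {
            "eval_name": "Test Eval",   # the active spec's eval survives
            "spec_id": "active_spec1",  # mapped via eval_id_to_spec_id
        }
        # "Archived Eval" is filtered out entirely, so len(eval_results) == 1
    ]
}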

app/web_ui/src/routes/(app)/specs/[project_id]/[task_id]/+page.svelte

Lines changed: 3 additions & 1 deletion
@@ -904,7 +904,9 @@
             spec.id &&
             selected_specs.has(spec.id)
               ? 'bg-base-200'
-              : ''} {spec.status === 'archived' ? 'opacity-60' : ''}"
+              : ''} {spec.status === 'archived'
+              ? 'text-base-content/60'
+              : ''}"
           on:click={() => {
             if (select_mode) {
               toggle_selection(spec.id || "")

0 commit comments