Skip to content

Commit 5109268

Browse files
authored
Merge pull request #1191 from Kiln-AI/KIL-492/compare-archive-specs
Remove archived specs from compare evals screen
2 parents 395ce8b + dbe70d6 commit 5109268

File tree

3 files changed

+139
-2
lines changed

3 files changed

+139
-2
lines changed

app/desktop/studio_server/eval_api.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from kiln_ai.datamodel.json_schema import string_to_json_key
2626
from kiln_ai.datamodel.prompt_id import is_frozen_prompt
2727
from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties
28+
from kiln_ai.datamodel.spec import SpecStatus
2829
from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig
2930
from kiln_ai.datamodel.task_output import normalize_rating
3031
from kiln_ai.utils.name_generator import generate_memorable_name
@@ -1271,11 +1272,15 @@ async def get_run_config_eval_scores(
12711272
task_run_config_from_id(project_id, task_id, run_config_id)
12721273

12731274
# Build a mapping from eval_id to spec_id for evals that are associated with specs
1275+
# Also track which eval_ids belong to archived specs so we can exclude them
12741276
specs = task.specs()
12751277
eval_id_to_spec_id: Dict[str, str] = {}
1278+
archived_eval_ids: set[str] = set()
12761279
for spec in specs:
12771280
if spec.eval_id and spec.id:
12781281
eval_id_to_spec_id[spec.eval_id] = spec.id
1282+
if spec.status == SpecStatus.archived:
1283+
archived_eval_ids.add(spec.eval_id)
12791284

12801285
evals = task.evals()
12811286
eval_results: List[RunConfigEvalResult] = []
@@ -1292,6 +1297,10 @@ async def get_run_config_eval_scores(
12921297
total_eval_runs = 0
12931298

12941299
for eval in evals:
1300+
# Skip evals associated with archived specs
1301+
if eval.id and eval.id in archived_eval_ids:
1302+
continue
1303+
12951304
# Get the dataset size for this eval
12961305
expected_dataset_ids = dataset_ids_in_filter(
12971306
task, eval.eval_set_filter_id, readonly=True

app/desktop/studio_server/test_eval_api.py

Lines changed: 127 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
)
4848
from kiln_ai.datamodel.prompt import BasePrompt
4949
from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties
50-
from kiln_ai.datamodel.spec import Spec
50+
from kiln_ai.datamodel.spec import Spec, SpecStatus
5151
from kiln_ai.datamodel.spec_properties import DesiredBehaviourProperties, SpecType
5252
from kiln_ai.datamodel.task import TaskRunConfig
5353
from kiln_ai.datamodel.task_run import Usage
@@ -2362,6 +2362,132 @@ async def test_get_run_config_eval_scores_includes_spec_id(
23622362
assert legacy_eval_result["spec_id"] is None
23632363

23642364

2365+
@pytest.mark.asyncio
2366+
async def test_get_run_config_eval_scores_excludes_archived_specs(
2367+
client, mock_task, mock_eval, mock_eval_config, mock_run_config
2368+
):
2369+
"""Test that get_run_config_eval_scores excludes evals associated with archived specs"""
2370+
2371+
# Create an active spec
2372+
active_spec = Spec(
2373+
id="active_spec1",
2374+
name="Active Spec",
2375+
definition="Active spec definition",
2376+
properties=DesiredBehaviourProperties(
2377+
spec_type=SpecType.desired_behaviour,
2378+
core_requirement="test instruction",
2379+
desired_behaviour_description="test desired behaviour",
2380+
),
2381+
eval_id=mock_eval.id,
2382+
status=SpecStatus.active,
2383+
parent=mock_task,
2384+
)
2385+
active_spec.save_to_file()
2386+
2387+
# Create an archived spec with its own eval
2388+
archived_eval = Eval(
2389+
id="archived_eval1",
2390+
name="Archived Eval",
2391+
description="Eval for archived spec",
2392+
template=None,
2393+
eval_set_filter_id="tag::archived_eval_set",
2394+
eval_configs_filter_id="tag::archived_golden",
2395+
output_scores=[
2396+
EvalOutputScore(
2397+
name="score1",
2398+
instruction="desc1",
2399+
type=TaskOutputRatingType.five_star,
2400+
),
2401+
],
2402+
parent=mock_task,
2403+
)
2404+
archived_eval.save_to_file()
2405+
2406+
archived_eval_config = EvalConfig(
2407+
id="archived_eval_config1",
2408+
name="Archived Eval Config",
2409+
config_type=EvalConfigType.g_eval,
2410+
properties={"eval_steps": ["step1"]},
2411+
parent=archived_eval,
2412+
model_name="gpt-4",
2413+
model_provider="openai",
2414+
)
2415+
archived_eval_config.save_to_file()
2416+
archived_eval.current_config_id = archived_eval_config.id
2417+
archived_eval.save_to_file()
2418+
2419+
archived_spec = Spec(
2420+
id="archived_spec1",
2421+
name="Archived Spec",
2422+
definition="Archived spec definition",
2423+
properties=DesiredBehaviourProperties(
2424+
spec_type=SpecType.desired_behaviour,
2425+
core_requirement="test instruction",
2426+
desired_behaviour_description="test desired behaviour",
2427+
),
2428+
eval_id=archived_eval.id,
2429+
status=SpecStatus.archived,
2430+
parent=mock_task,
2431+
)
2432+
archived_spec.save_to_file()
2433+
2434+
# Build mock eval objects with explicit attributes
2435+
mock_eval_config_for_api = MagicMock()
2436+
mock_eval_config_for_api.id = mock_eval_config.id
2437+
mock_eval_config_for_api.runs.return_value = []
2438+
2439+
mock_eval_for_api = MagicMock()
2440+
mock_eval_for_api.id = mock_eval.id
2441+
mock_eval_for_api.name = mock_eval.name
2442+
mock_eval_for_api.eval_set_filter_id = mock_eval.eval_set_filter_id
2443+
mock_eval_for_api.output_scores = mock_eval.output_scores
2444+
mock_eval_for_api.current_config_id = mock_eval_config.id
2445+
mock_eval_for_api.configs.return_value = [mock_eval_config_for_api]
2446+
2447+
archived_eval_config_for_api = MagicMock()
2448+
archived_eval_config_for_api.id = archived_eval_config.id
2449+
archived_eval_config_for_api.runs.return_value = []
2450+
2451+
archived_eval_for_api = MagicMock()
2452+
archived_eval_for_api.id = archived_eval.id
2453+
archived_eval_for_api.name = archived_eval.name
2454+
archived_eval_for_api.eval_set_filter_id = archived_eval.eval_set_filter_id
2455+
archived_eval_for_api.output_scores = archived_eval.output_scores
2456+
archived_eval_for_api.current_config_id = archived_eval_config.id
2457+
archived_eval_for_api.configs.return_value = [archived_eval_config_for_api]
2458+
2459+
mock_task_for_api = MagicMock()
2460+
mock_task_for_api.evals.return_value = [mock_eval_for_api, archived_eval_for_api]
2461+
mock_task_for_api.specs.return_value = [active_spec, archived_spec]
2462+
2463+
with (
2464+
patch(
2465+
"app.desktop.studio_server.eval_api.task_from_id"
2466+
) as mock_task_from_id_patch,
2467+
patch(
2468+
"app.desktop.studio_server.eval_api.task_run_config_from_id"
2469+
) as mock_task_run_config_from_id_patch,
2470+
patch(
2471+
"app.desktop.studio_server.eval_api.dataset_ids_in_filter"
2472+
) as mock_dataset_ids_in_filter,
2473+
):
2474+
mock_task_from_id_patch.return_value = mock_task_for_api
2475+
mock_task_run_config_from_id_patch.return_value = mock_run_config
2476+
mock_dataset_ids_in_filter.return_value = set()
2477+
2478+
response = client.get(
2479+
f"/api/projects/project1/tasks/task1/run_config/{mock_run_config.id}/eval_scores"
2480+
)
2481+
2482+
assert response.status_code == 200
2483+
data = response.json()
2484+
2485+
# Only the active spec's eval should be present, not the archived one
2486+
assert len(data["eval_results"]) == 1
2487+
assert data["eval_results"][0]["eval_name"] == "Test Eval"
2488+
assert data["eval_results"][0]["spec_id"] == "active_spec1"
2489+
2490+
23652491
@pytest.mark.asyncio
23662492
async def test_get_run_configs_includes_finetunes_with_run_config(
23672493
client, mock_task_from_id, mock_task

app/web_ui/src/routes/(app)/specs/[project_id]/[task_id]/+page.svelte

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -904,7 +904,9 @@
904904
spec.id &&
905905
selected_specs.has(spec.id)
906906
? 'bg-base-200'
907-
: ''} {spec.status === 'archived' ? 'opacity-60' : ''}"
907+
: ''} {spec.status === 'archived'
908+
? 'text-base-content/60'
909+
: ''}"
908910
on:click={() => {
909911
if (select_mode) {
910912
toggle_selection(spec.id || "")

0 commit comments

Comments (0)