|
47 | 47 | ) |
48 | 48 | from kiln_ai.datamodel.prompt import BasePrompt |
49 | 49 | from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties |
50 | | -from kiln_ai.datamodel.spec import Spec |
| 50 | +from kiln_ai.datamodel.spec import Spec, SpecStatus |
51 | 51 | from kiln_ai.datamodel.spec_properties import DesiredBehaviourProperties, SpecType |
52 | 52 | from kiln_ai.datamodel.task import TaskRunConfig |
53 | 53 | from kiln_ai.datamodel.task_run import Usage |
@@ -2362,6 +2362,132 @@ async def test_get_run_config_eval_scores_includes_spec_id( |
2362 | 2362 | assert legacy_eval_result["spec_id"] is None |
2363 | 2363 |
|
2364 | 2364 |
|
@pytest.mark.asyncio
async def test_get_run_config_eval_scores_excludes_archived_specs(
    client, mock_task, mock_eval, mock_eval_config, mock_run_config
):
    """Test that get_run_config_eval_scores excludes evals associated with archived specs.

    Arranges two spec/eval pairs on the task:
      * an active spec pointing at the fixture ``mock_eval`` — its eval must
        appear in the endpoint response, and
      * an archived spec pointing at a freshly created eval/config pair — its
        eval must be filtered out of the response entirely.
    """
    # Active spec linked to the fixture eval; this is the one result we expect back.
    active_spec = Spec(
        id="active_spec1",
        name="Active Spec",
        definition="Active spec definition",
        properties=DesiredBehaviourProperties(
            spec_type=SpecType.desired_behaviour,
            core_requirement="test instruction",
            desired_behaviour_description="test desired behaviour",
        ),
        eval_id=mock_eval.id,
        status=SpecStatus.active,
        parent=mock_task,
    )
    active_spec.save_to_file()

    # A distinct eval owned by the archived spec — present on the task but
    # expected to be excluded from the endpoint's results.
    archived_eval = Eval(
        id="archived_eval1",
        name="Archived Eval",
        description="Eval for archived spec",
        template=None,
        eval_set_filter_id="tag::archived_eval_set",
        eval_configs_filter_id="tag::archived_golden",
        output_scores=[
            EvalOutputScore(
                name="score1",
                instruction="desc1",
                type=TaskOutputRatingType.five_star,
            ),
        ],
        parent=mock_task,
    )
    archived_eval.save_to_file()

    archived_eval_config = EvalConfig(
        id="archived_eval_config1",
        name="Archived Eval Config",
        config_type=EvalConfigType.g_eval,
        properties={"eval_steps": ["step1"]},
        parent=archived_eval,
        model_name="gpt-4",
        model_provider="openai",
    )
    archived_eval_config.save_to_file()
    # Point the archived eval at its config and persist the link, mirroring
    # how a real eval would reference its current configuration.
    archived_eval.current_config_id = archived_eval_config.id
    archived_eval.save_to_file()

    archived_spec = Spec(
        id="archived_spec1",
        name="Archived Spec",
        definition="Archived spec definition",
        properties=DesiredBehaviourProperties(
            spec_type=SpecType.desired_behaviour,
            core_requirement="test instruction",
            desired_behaviour_description="test desired behaviour",
        ),
        eval_id=archived_eval.id,
        status=SpecStatus.archived,
        parent=mock_task,
    )
    archived_spec.save_to_file()

    # Build mock eval objects with explicit attributes so the API handler can
    # traverse eval -> configs -> runs without touching the filesystem.
    mock_eval_config_for_api = MagicMock()
    mock_eval_config_for_api.id = mock_eval_config.id
    mock_eval_config_for_api.runs.return_value = []

    mock_eval_for_api = MagicMock()
    mock_eval_for_api.id = mock_eval.id
    # NOTE: MagicMock treats ``name`` specially in its constructor, so it is
    # assigned after construction here.
    mock_eval_for_api.name = mock_eval.name
    mock_eval_for_api.eval_set_filter_id = mock_eval.eval_set_filter_id
    mock_eval_for_api.output_scores = mock_eval.output_scores
    mock_eval_for_api.current_config_id = mock_eval_config.id
    mock_eval_for_api.configs.return_value = [mock_eval_config_for_api]

    archived_eval_config_for_api = MagicMock()
    archived_eval_config_for_api.id = archived_eval_config.id
    archived_eval_config_for_api.runs.return_value = []

    archived_eval_for_api = MagicMock()
    archived_eval_for_api.id = archived_eval.id
    archived_eval_for_api.name = archived_eval.name
    archived_eval_for_api.eval_set_filter_id = archived_eval.eval_set_filter_id
    archived_eval_for_api.output_scores = archived_eval.output_scores
    archived_eval_for_api.current_config_id = archived_eval_config.id
    archived_eval_for_api.configs.return_value = [archived_eval_config_for_api]

    # The task exposes both evals and both specs; the handler must use the
    # specs' statuses to decide which evals to report.
    mock_task_for_api = MagicMock()
    mock_task_for_api.evals.return_value = [mock_eval_for_api, archived_eval_for_api]
    mock_task_for_api.specs.return_value = [active_spec, archived_spec]

    with (
        patch(
            "app.desktop.studio_server.eval_api.task_from_id"
        ) as mock_task_from_id_patch,
        patch(
            "app.desktop.studio_server.eval_api.task_run_config_from_id"
        ) as mock_task_run_config_from_id_patch,
        patch(
            "app.desktop.studio_server.eval_api.dataset_ids_in_filter"
        ) as mock_dataset_ids_in_filter,
    ):
        mock_task_from_id_patch.return_value = mock_task_for_api
        mock_task_run_config_from_id_patch.return_value = mock_run_config
        # Empty dataset: no eval runs are expected, only result-list membership.
        mock_dataset_ids_in_filter.return_value = set()

        response = client.get(
            f"/api/projects/project1/tasks/task1/run_config/{mock_run_config.id}/eval_scores"
        )

        assert response.status_code == 200
        data = response.json()

        # Only the active spec's eval should be present, not the archived one.
        # Compare against the fixture/object attributes rather than duplicated
        # literals, so the assertions cannot drift from the fixture definitions.
        assert len(data["eval_results"]) == 1
        assert data["eval_results"][0]["eval_name"] == mock_eval.name
        assert data["eval_results"][0]["spec_id"] == active_spec.id
| 2490 | + |
2365 | 2491 | @pytest.mark.asyncio |
2366 | 2492 | async def test_get_run_configs_includes_finetunes_with_run_config( |
2367 | 2493 | client, mock_task_from_id, mock_task |
|
0 commit comments