
Commit 6e0496c

Merge branch 'main' into KIL-506/task-warn
2 parents: f4f1ad3 + 7e69056


12 files changed: +753 additions, −154 deletions


app/desktop/studio_server/eval_api.py

Lines changed: 9 additions & 0 deletions
@@ -25,6 +25,7 @@
 from kiln_ai.datamodel.json_schema import string_to_json_key
 from kiln_ai.datamodel.prompt_id import is_frozen_prompt
 from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties
+from kiln_ai.datamodel.spec import SpecStatus
 from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig
 from kiln_ai.datamodel.task_output import normalize_rating
 from kiln_ai.utils.name_generator import generate_memorable_name
@@ -1271,11 +1272,15 @@ async def get_run_config_eval_scores(
     task_run_config_from_id(project_id, task_id, run_config_id)

     # Build a mapping from eval_id to spec_id for evals that are associated with specs
+    # Also track which eval_ids belong to archived specs so we can exclude them
     specs = task.specs()
     eval_id_to_spec_id: Dict[str, str] = {}
+    archived_eval_ids: set[str] = set()
     for spec in specs:
         if spec.eval_id and spec.id:
             eval_id_to_spec_id[spec.eval_id] = spec.id
+            if spec.status == SpecStatus.archived:
+                archived_eval_ids.add(spec.eval_id)

     evals = task.evals()
     eval_results: List[RunConfigEvalResult] = []
@@ -1292,6 +1297,10 @@ async def get_run_config_eval_scores(
     total_eval_runs = 0

     for eval in evals:
+        # Skip evals associated with archived specs
+        if eval.id and eval.id in archived_eval_ids:
+            continue
+
         # Get the dataset size for this eval
         expected_dataset_ids = dataset_ids_in_filter(
             task, eval.eval_set_filter_id, readonly=True
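
The change above works in two passes: while building the eval_id → spec_id mapping, any eval whose spec is archived is collected into archived_eval_ids, and the scoring loop then skips those evals before doing any dataset or run counting. Below is a minimal standalone sketch of that pattern; it uses plain dicts and a local SpecStatus enum as stand-ins for Kiln's Spec and Eval datamodel classes, so the names and shapes are illustrative rather than the real API.

from enum import Enum


class SpecStatus(str, Enum):
    # Stand-in for kiln_ai.datamodel.spec.SpecStatus; string values assumed for illustration.
    active = "active"
    archived = "archived"


def visible_evals(specs: list[dict], evals: list[dict]) -> list[dict]:
    """Return evals not tied to an archived spec (sketch of the endpoint's filtering)."""
    eval_id_to_spec_id: dict[str, str] = {}
    archived_eval_ids: set[str] = set()
    for spec in specs:
        if spec.get("eval_id") and spec.get("id"):
            eval_id_to_spec_id[spec["eval_id"]] = spec["id"]
            if spec["status"] == SpecStatus.archived:
                archived_eval_ids.add(spec["eval_id"])
    # Evals linked to an archived spec never reach the scoring loop.
    return [e for e in evals if e["id"] not in archived_eval_ids]


specs = [
    {"id": "active_spec1", "eval_id": "eval1", "status": SpecStatus.active},
    {"id": "archived_spec1", "eval_id": "archived_eval1", "status": SpecStatus.archived},
]
evals = [{"id": "eval1"}, {"id": "archived_eval1"}]
assert [e["id"] for e in visible_evals(specs, evals)] == ["eval1"]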

app/desktop/studio_server/test_eval_api.py

Lines changed: 127 additions & 1 deletion
@@ -47,7 +47,7 @@
 )
 from kiln_ai.datamodel.prompt import BasePrompt
 from kiln_ai.datamodel.run_config import KilnAgentRunConfigProperties
-from kiln_ai.datamodel.spec import Spec
+from kiln_ai.datamodel.spec import Spec, SpecStatus
 from kiln_ai.datamodel.spec_properties import DesiredBehaviourProperties, SpecType
 from kiln_ai.datamodel.task import TaskRunConfig
 from kiln_ai.datamodel.task_run import Usage
@@ -2362,6 +2362,132 @@ async def test_get_run_config_eval_scores_includes_spec_id(
     assert legacy_eval_result["spec_id"] is None


+@pytest.mark.asyncio
+async def test_get_run_config_eval_scores_excludes_archived_specs(
+    client, mock_task, mock_eval, mock_eval_config, mock_run_config
+):
+    """Test that get_run_config_eval_scores excludes evals associated with archived specs"""
+
+    # Create an active spec
+    active_spec = Spec(
+        id="active_spec1",
+        name="Active Spec",
+        definition="Active spec definition",
+        properties=DesiredBehaviourProperties(
+            spec_type=SpecType.desired_behaviour,
+            core_requirement="test instruction",
+            desired_behaviour_description="test desired behaviour",
+        ),
+        eval_id=mock_eval.id,
+        status=SpecStatus.active,
+        parent=mock_task,
+    )
+    active_spec.save_to_file()
+
+    # Create an archived spec with its own eval
+    archived_eval = Eval(
+        id="archived_eval1",
+        name="Archived Eval",
+        description="Eval for archived spec",
+        template=None,
+        eval_set_filter_id="tag::archived_eval_set",
+        eval_configs_filter_id="tag::archived_golden",
+        output_scores=[
+            EvalOutputScore(
+                name="score1",
+                instruction="desc1",
+                type=TaskOutputRatingType.five_star,
+            ),
+        ],
+        parent=mock_task,
+    )
+    archived_eval.save_to_file()
+
+    archived_eval_config = EvalConfig(
+        id="archived_eval_config1",
+        name="Archived Eval Config",
+        config_type=EvalConfigType.g_eval,
+        properties={"eval_steps": ["step1"]},
+        parent=archived_eval,
+        model_name="gpt-4",
+        model_provider="openai",
+    )
+    archived_eval_config.save_to_file()
+    archived_eval.current_config_id = archived_eval_config.id
+    archived_eval.save_to_file()
+
+    archived_spec = Spec(
+        id="archived_spec1",
+        name="Archived Spec",
+        definition="Archived spec definition",
+        properties=DesiredBehaviourProperties(
+            spec_type=SpecType.desired_behaviour,
+            core_requirement="test instruction",
+            desired_behaviour_description="test desired behaviour",
+        ),
+        eval_id=archived_eval.id,
+        status=SpecStatus.archived,
+        parent=mock_task,
+    )
+    archived_spec.save_to_file()
+
+    # Build mock eval objects with explicit attributes
+    mock_eval_config_for_api = MagicMock()
+    mock_eval_config_for_api.id = mock_eval_config.id
+    mock_eval_config_for_api.runs.return_value = []
+
+    mock_eval_for_api = MagicMock()
+    mock_eval_for_api.id = mock_eval.id
+    mock_eval_for_api.name = mock_eval.name
+    mock_eval_for_api.eval_set_filter_id = mock_eval.eval_set_filter_id
+    mock_eval_for_api.output_scores = mock_eval.output_scores
+    mock_eval_for_api.current_config_id = mock_eval_config.id
+    mock_eval_for_api.configs.return_value = [mock_eval_config_for_api]
+
+    archived_eval_config_for_api = MagicMock()
+    archived_eval_config_for_api.id = archived_eval_config.id
+    archived_eval_config_for_api.runs.return_value = []
+
+    archived_eval_for_api = MagicMock()
+    archived_eval_for_api.id = archived_eval.id
+    archived_eval_for_api.name = archived_eval.name
+    archived_eval_for_api.eval_set_filter_id = archived_eval.eval_set_filter_id
+    archived_eval_for_api.output_scores = archived_eval.output_scores
+    archived_eval_for_api.current_config_id = archived_eval_config.id
+    archived_eval_for_api.configs.return_value = [archived_eval_config_for_api]
+
+    mock_task_for_api = MagicMock()
+    mock_task_for_api.evals.return_value = [mock_eval_for_api, archived_eval_for_api]
+    mock_task_for_api.specs.return_value = [active_spec, archived_spec]
+
+    with (
+        patch(
+            "app.desktop.studio_server.eval_api.task_from_id"
+        ) as mock_task_from_id_patch,
+        patch(
+            "app.desktop.studio_server.eval_api.task_run_config_from_id"
+        ) as mock_task_run_config_from_id_patch,
+        patch(
+            "app.desktop.studio_server.eval_api.dataset_ids_in_filter"
+        ) as mock_dataset_ids_in_filter,
+    ):
+        mock_task_from_id_patch.return_value = mock_task_for_api
+        mock_task_run_config_from_id_patch.return_value = mock_run_config
+        mock_dataset_ids_in_filter.return_value = set()
+
+        response = client.get(
+            f"/api/projects/project1/tasks/task1/run_configs/{mock_run_config.id}/eval_scores"
+        )

+        assert response.status_code == 200
+        data = response.json()
+
+        # Only the active spec's eval should be present, not the archived one
+        assert len(data["eval_results"]) == 1
+        assert data["eval_results"][0]["eval_name"] == "Test Eval"
+        assert data["eval_results"][0]["spec_id"] == "active_spec1"
+
+
 @pytest.mark.asyncio
 async def test_get_run_configs_includes_finetunes_with_run_config(
     client, mock_task_from_id, mock_task
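
The test above drives the endpoint through FastAPI's TestClient. For reference, the sketch below shows roughly the same call against a running studio server; it is a hedged usage example in which BASE_URL, the run config id, and the use of the requests library are assumptions for illustration, while the URL pattern and the eval_name/spec_id fields mirror the test's assertions.

import requests  # any HTTP client works; requests is used only for illustration

BASE_URL = "http://localhost:8757"  # hypothetical local address; point this at your studio server
run_config_id = "run_config1"  # hypothetical id; substitute a real task run config id

resp = requests.get(
    f"{BASE_URL}/api/projects/project1/tasks/task1/run_configs/{run_config_id}/eval_scores"
)
resp.raise_for_status()
for result in resp.json()["eval_results"]:
    # Evals tied to archived specs are filtered out server-side, so each entry
    # belongs to an active spec (spec_id set) or a legacy eval (spec_id is None).
    print(result["eval_name"], result["spec_id"])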

app/web_ui/src/lib/types.ts

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ export type DockerModelRunnerConnection =
   components["schemas"]["DockerModelRunnerConnection"]
 export type RunSummary = components["schemas"]["RunSummary"]
 export type PromptResponse = components["schemas"]["PromptResponse"]
+export type ApiPrompt = components["schemas"]["ApiPrompt"]
 export type ChatStrategy = components["schemas"]["ChatStrategy"]
 export type EvalOutputScore = components["schemas"]["EvalOutputScore"]
 export type EvalTemplateId = components["schemas"]["EvalTemplateId"]

app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/+page.svelte

Lines changed: 14 additions & 1 deletion
@@ -11,7 +11,7 @@
     prompts_by_task_composite_id,
   } from "$lib/stores/prompts_store"
   import { onMount } from "svelte"
-  import type { Task } from "$lib/types"
+  import type { Task, ApiPrompt } from "$lib/types"
   import { createKilnError, KilnError } from "$lib/utils/error_handlers"
   import { getPromptType } from "./prompt_generators/prompt_generators"
   import InfoTooltip from "$lib/ui/info_tooltip.svelte"
@@ -88,6 +88,12 @@
     goto(`/prompts/${project_id}/${task_id}/edit_base_prompt`)
   }

+  function handleClonePrompt(prompt: ApiPrompt) {
+    goto(
+      `/prompts/${project_id}/${task_id}/clone/${encodeURIComponent(prompt.id)}`,
+    )
+  }
+
   type TableColumn = {
     key: string
     label: string
@@ -303,6 +309,13 @@
                   Set as Base Prompt
                 </button>
               </li>
+              <li>
+                <button
+                  on:click={() => handleClonePrompt(prompt)}
+                >
+                  Clone
+                </button>
+              </li>
             </ul>
           </Float>
         </div>
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+<script lang="ts">
+  import AppPage from "../../../../../app_page.svelte"
+  import { page } from "$app/stores"
+  import { KilnError, createKilnError } from "$lib/utils/error_handlers"
+  import {
+    load_task_prompts,
+    prompts_by_task_composite_id,
+  } from "$lib/stores/prompts_store"
+  import { get_task_composite_id } from "$lib/stores"
+  import { onMount } from "svelte"
+  import PromptForm from "../../prompt_form.svelte"
+
+  $: project_id = $page.params.project_id!
+  $: task_id = $page.params.task_id!
+  $: prompt_id = $page.params.prompt_id!
+
+  let initial_prompt_name = ""
+  let initial_prompt = ""
+  let initial_chain_of_thought_instructions: string | null = null
+  let loading = true
+  let loading_error: KilnError | null = null
+
+  onMount(async () => {
+    try {
+      await load_task_prompts(project_id, task_id)
+      const task_prompts =
+        $prompts_by_task_composite_id[
+          get_task_composite_id(project_id, task_id)
+        ]
+      const source_prompt = task_prompts?.prompts.find(
+        (p) => p.id === prompt_id,
+      )
+
+      if (!source_prompt) {
+        throw new KilnError("Source prompt not found.")
+      }
+
+      initial_prompt_name = `Copy of ${source_prompt.name}`
+      initial_prompt = source_prompt.prompt
+      initial_chain_of_thought_instructions =
+        source_prompt.chain_of_thought_instructions || null
+    } catch (e) {
+      loading_error = createKilnError(e)
+    } finally {
+      loading = false
+    }
+  })
+</script>
+
+<div class="max-w-[1400px]">
+  <AppPage
+    title="Clone Prompt"
+    sub_subtitle="Read the Docs"
+    sub_subtitle_link="https://docs.kiln.tech/docs/prompts"
+    breadcrumbs={[
+      {
+        label: "Optimize",
+        href: `/optimize/${project_id}/${task_id}`,
+      },
+      {
+        label: "Prompts",
+        href: `/prompts/${project_id}/${task_id}`,
+      },
+    ]}
+  >
+    {#if loading}
+      <div class="w-full min-h-[50vh] flex justify-center items-center">
+        <div class="loading loading-spinner loading-lg"></div>
+      </div>
+    {:else if loading_error}
+      <div class="text-error text-sm">
+        {loading_error.getMessage() || "An unknown error occurred"}
+      </div>
+    {:else}
+      <PromptForm
+        {project_id}
+        {task_id}
+        clone_mode={true}
+        {initial_prompt_name}
+        {initial_prompt}
+        {initial_chain_of_thought_instructions}
+      />
+    {/if}
+  </AppPage>
+</div>
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+export const prerender = false
