Skip to content

Commit 30c00f2

Browse files
authored
Merge pull request #1192 from Kiln-AI/KIL-491/spec-eval-name-sync
Sync eval name when spec name is updated
2 parents 20267e4 + 2ce9a66 commit 30c00f2

File tree

4 files changed

+335
-13
lines changed

4 files changed

+335
-13
lines changed

app/desktop/studio_server/copilot_api.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ async def create_spec_with_copilot(
303303
models_to_save: list[Eval | EvalConfig | TaskRun | Spec] = []
304304

305305
# 1. Create the Eval
306-
eval_model = Eval(
306+
eval = Eval(
307307
parent=task,
308308
name=request.name,
309309
description=None,
@@ -315,11 +315,11 @@ async def create_spec_with_copilot(
315315
template_properties=None,
316316
evaluation_data_type=evaluation_data_type,
317317
)
318-
models_to_save.append(eval_model)
318+
models_to_save.append(eval)
319319

320320
# 2. Create judge eval config
321321
eval_config = EvalConfig(
322-
parent=eval_model,
322+
parent=eval,
323323
name=generate_memorable_name(),
324324
config_type=EvalConfigType.llm_as_judge,
325325
model_name=request.judge_info.task_metadata.model_name,
@@ -332,7 +332,7 @@ async def create_spec_with_copilot(
332332
models_to_save.append(eval_config)
333333

334334
# Set as default config after ID is assigned
335-
eval_model.current_config_id = eval_config.id
335+
eval.current_config_id = eval_config.id
336336

337337
# 3. Generate examples via copilot API
338338
api_key = get_copilot_api_key()
@@ -379,7 +379,7 @@ async def create_spec_with_copilot(
379379
priority=Priority.p1,
380380
status=SpecStatus.active,
381381
tags=[],
382-
eval_id=eval_model.id,
382+
eval_id=eval.id,
383383
task_sample=request.task_sample,
384384
synthetic_data_generation_session_config=SyntheticDataGenerationSessionConfig(
385385
topic_generation_config=SyntheticDataGenerationStepConfig(
@@ -405,8 +405,8 @@ async def create_spec_with_copilot(
405405
# Save everything, with cleanup on failure.
406406
saved_models: list[Eval | EvalConfig | TaskRun | Spec] = []
407407
try:
408-
eval_model.save_to_file()
409-
saved_models.append(eval_model)
408+
eval.save_to_file()
409+
saved_models.append(eval)
410410

411411
eval_config.save_to_file()
412412
saved_models.append(eval_config)

app/desktop/studio_server/test_copilot_api.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
from app.desktop.studio_server.copilot_api import connect_copilot_api
1414
from fastapi import FastAPI
1515
from fastapi.testclient import TestClient
16+
from kiln_ai.datamodel import Project, Task
17+
from kiln_ai.datamodel.spec_properties import SpecType
1618
from kiln_server.custom_errors import connect_custom_errors
1719

1820

@@ -349,3 +351,96 @@ def test_generate_batch_validation_error(
349351
)
350352
assert response.status_code == 422
351353
assert "Validation error from server" in response.json()["message"]
354+
355+
356+
class TestCreateSpecWithCopilot:
357+
@pytest.fixture
358+
def project_and_task(self, tmp_path):
359+
project_path = tmp_path / "test_project" / "project.kiln"
360+
project_path.parent.mkdir()
361+
project = Project(name="Test Project", path=project_path)
362+
project.save_to_file()
363+
task = Task(
364+
name="Test Task",
365+
instruction="Test instruction",
366+
description="Test task",
367+
parent=project,
368+
)
369+
task.save_to_file()
370+
return project, task
371+
372+
@pytest.fixture
373+
def copilot_request_data(self):
374+
step_config = {
375+
"task_metadata": {
376+
"model_name": "gpt-4",
377+
"model_provider_name": "openai",
378+
},
379+
"prompt": "Test prompt",
380+
}
381+
return {
382+
"name": "Test Spec",
383+
"definition": "The system should respond politely",
384+
"properties": {
385+
"spec_type": SpecType.tone.value,
386+
"core_requirement": "Be polite",
387+
"tone_description": "Professional and friendly",
388+
},
389+
"judge_info": step_config,
390+
"sdg_session_config": {
391+
"topic_generation_config": step_config,
392+
"input_generation_config": step_config,
393+
"output_generation_config": step_config,
394+
},
395+
"task_description": "Test task",
396+
"task_prompt_with_example": "Test prompt",
397+
}
398+
399+
def test_create_spec_with_copilot_success(
400+
self, client, project_and_task, copilot_request_data
401+
):
402+
project, task = project_and_task
403+
404+
with (
405+
patch(
406+
"app.desktop.studio_server.copilot_api.task_from_id",
407+
return_value=task,
408+
),
409+
patch(
410+
"app.desktop.studio_server.copilot_api.get_copilot_api_key",
411+
return_value="test_key",
412+
),
413+
patch(
414+
"app.desktop.studio_server.copilot_api.generate_copilot_examples",
415+
new_callable=AsyncMock,
416+
return_value={},
417+
),
418+
patch(
419+
"app.desktop.studio_server.copilot_api.create_dataset_task_runs",
420+
return_value=[],
421+
),
422+
patch(
423+
"app.desktop.studio_server.copilot_api.generate_memorable_name",
424+
return_value="test-config-name",
425+
),
426+
):
427+
response = client.post(
428+
f"/api/projects/{project.id}/tasks/{task.id}/spec_with_copilot",
429+
json=copilot_request_data,
430+
)
431+
432+
assert response.status_code == 200
433+
res = response.json()
434+
assert res["name"] == "Test Spec"
435+
assert res["definition"] == "The system should respond politely"
436+
assert res["eval_id"] is not None
437+
438+
# Verify models were saved
439+
evals = task.evals()
440+
assert len(evals) == 1
441+
assert evals[0].name == "Test Spec"
442+
assert evals[0].current_config_id is not None
443+
444+
specs = task.specs()
445+
assert len(specs) == 1
446+
assert specs[0].eval_id == evals[0].id

libs/server/kiln_server/spec_api.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ async def create_spec(
112112
spec_type, spec_data.evaluate_full_trace
113113
)
114114

115-
eval_model = Eval(
115+
eval = Eval(
116116
parent=task,
117117
name=spec_data.name,
118118
description=None,
@@ -133,15 +133,15 @@ async def create_spec(
133133
priority=spec_data.priority,
134134
status=spec_data.status,
135135
tags=spec_data.tags,
136-
eval_id=eval_model.id,
136+
eval_id=eval.id,
137137
task_sample=spec_data.task_sample,
138138
)
139139

140-
eval_model.save_to_file()
140+
eval.save_to_file()
141141
try:
142142
spec.save_to_file()
143143
except Exception:
144-
eval_model.delete()
144+
eval.delete()
145145
raise
146146

147147
return spec
@@ -217,7 +217,30 @@ async def update_spec(
217217
if request.tags is not None:
218218
spec.tags = request.tags
219219

220-
spec.save_to_file()
220+
# Sync eval name when spec name changes
221+
eval: Eval | None = None
222+
previous_eval_name: str | None = None
223+
if request.name is not None and spec.eval_id:
224+
parent_task = task_from_id(project_id, task_id)
225+
eval = Eval.from_id_and_parent_path(spec.eval_id, parent_task.path)
226+
if eval and eval.name != request.name:
227+
previous_eval_name = eval.name
228+
eval.name = request.name
229+
eval.save_to_file()
230+
231+
try:
232+
spec.save_to_file()
233+
except Exception:
234+
if eval is not None and previous_eval_name is not None:
235+
try:
236+
eval.name = previous_eval_name
237+
eval.save_to_file()
238+
except Exception:
239+
logger.exception(
240+
"Failed to roll back eval name after spec save failure"
241+
)
242+
raise
243+
221244
return spec
222245

223246
@app.delete(
@@ -241,7 +264,9 @@ async def delete_spec(
241264
# Delete associated eval if it exists
242265
if spec.eval_id:
243266
parent_task = task_from_id(project_id, task_id)
244-
eval = Eval.from_id_and_parent_path(spec.eval_id, parent_task.path)
267+
eval: Eval | None = Eval.from_id_and_parent_path(
268+
spec.eval_id, parent_task.path
269+
)
245270
if eval:
246271
eval.delete()
247272

0 commit comments

Comments (0)