
Commit b0f890f

sfierro and claude committed
Preserve feedback from spec review as Feedback children
When creating TaskRuns from reviewed examples in the copilot flow, create Feedback children (with source=spec-feedback) after saving the run, so review feedback is not lost.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8c58f6f commit b0f890f
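
The heart of the change is a new parent-child write: reviewer feedback from the spec flow is persisted as a Feedback child of each saved TaskRun. A minimal sketch of that relationship, assuming only the kiln_ai names visible in the diff below (the exact Feedback constructor is an assumption):

    # Hedged sketch: persist reviewer feedback as a child of a saved TaskRun.
    # Field names (feedback, source, parent) come from the diff below; the
    # rest of kiln_ai's Feedback model is assumed for illustration.
    from kiln_ai.datamodel import Feedback, FeedbackSource

    fb = Feedback(
        feedback="This fails because the output is too vague",  # reviewer's note
        source=FeedbackSource.spec_feedback,  # marks it as spec-review feedback
        parent=task_run,  # task_run must already be saved to file
    )
    fb.save_to_file()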

File tree

4 files changed: +92 -31 lines changed

    app/desktop/studio_server/copilot_api.py
    app/desktop/studio_server/test_copilot_api.py
    app/desktop/studio_server/utils/copilot_utils.py
    app/desktop/studio_server/utils/test_copilot_utils.py

app/desktop/studio_server/copilot_api.py
Lines changed: 3 additions & 1 deletion

@@ -370,14 +370,15 @@ async def create_spec_with_copilot(
         )

         # 4. Create TaskRuns for eval, train, and golden datasets
-        task_runs = create_dataset_task_runs(
+        dataset_runs = create_dataset_task_runs(
             all_examples=all_examples,
             reviewed_examples=request.reviewed_examples,
             eval_tag=eval_tag,
             train_tag=train_tag,
             golden_tag=golden_tag,
             spec_name=request.name,
         )
+        task_runs = dataset_runs.task_runs
         for run in task_runs:
             run.parent = task
         models_to_save.extend(task_runs)

@@ -430,6 +431,7 @@ async def create_spec_with_copilot(
         for run in task_runs:
             run.save_to_file()
             saved_models.append(run)
+            dataset_runs.save_pending_feedback(run)

     spec.save_to_file()
     saved_models.append(spec)
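
Why both hunks are needed: create_dataset_task_runs used to return a bare list[TaskRun], so the reviewer's feedback text had nowhere to live once the runs were built. The new result object parks the text until each run is on disk; per the new docstrings, a Feedback child should only be created after its parent TaskRun is saved. A condensed, hedged view of the resulting endpoint flow (names from the diff above; the real code splits this across two loops):

    dataset_runs = create_dataset_task_runs(...)  # now returns DatasetTaskRuns
    for run in dataset_runs.task_runs:
        run.parent = task
        run.save_to_file()                       # parent must exist on disk first
        dataset_runs.save_pending_feedback(run)  # writes the Feedback child, if any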

app/desktop/studio_server/test_copilot_api.py
Lines changed: 2 additions & 1 deletion

@@ -11,6 +11,7 @@
     RefineSpecApiOutput,
 )
 from app.desktop.studio_server.copilot_api import connect_copilot_api
+from app.desktop.studio_server.utils.copilot_utils import DatasetTaskRuns
 from fastapi import FastAPI
 from fastapi.testclient import TestClient
 from kiln_ai.datamodel import Project, Task

@@ -417,7 +418,7 @@ def test_create_spec_with_copilot_success(
         ),
         patch(
             "app.desktop.studio_server.copilot_api.create_dataset_task_runs",
-            return_value=[],
+            return_value=DatasetTaskRuns(),
         ),
         patch(
             "app.desktop.studio_server.copilot_api.generate_memorable_name",

app/desktop/studio_server/utils/copilot_utils.py
Lines changed: 47 additions & 15 deletions

@@ -25,7 +25,7 @@
 )
 from app.desktop.studio_server.utils.response_utils import unwrap_response
 from fastapi import HTTPException
-from kiln_ai.datamodel import TaskRun
+from kiln_ai.datamodel import Feedback, FeedbackSource, TaskRun
 from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
 from kiln_ai.datamodel.task_output import (
     DataSource,

@@ -172,8 +172,12 @@ def create_task_run_from_reviewed(
     tag: str,
     spec_name: str,
     extra_tags: list[str] | None = None,
-) -> TaskRun:
-    """Create a TaskRun from a reviewed example with rating (without parent set)."""
+) -> tuple[TaskRun, str | None]:
+    """Create a TaskRun from a reviewed example with rating (without parent set).
+
+    Returns a (TaskRun, feedback_text) tuple. The caller should create a Feedback
+    child on the TaskRun after saving it, if feedback_text is not None.
+    """
     data_source = DataSource(
         type=DataSourceType.synthetic,
         properties={

@@ -190,7 +194,7 @@ def create_task_run_from_reviewed(
     rating_key = f"named::{spec_name}"
     rating_value = 1.0 if example.user_says_meets_spec else 0.0

-    return TaskRun(
+    task_run = TaskRun(
         input=example.input,
         input_source=data_source,
         output=TaskOutput(

@@ -209,6 +213,34 @@ def create_task_run_from_reviewed(
         ),
         tags=tags,
     )
+    feedback_text = example.feedback if example.feedback else None
+    return task_run, feedback_text
+
+
+class DatasetTaskRuns:
+    """Result of creating dataset task runs, with pending feedback to attach after saving."""
+
+    def __init__(self) -> None:
+        self.task_runs: list[TaskRun] = []
+        self._pending_feedback: dict[str, str] = {}
+
+    def add_run(self, task_run: TaskRun, feedback_text: str | None = None) -> None:
+        self.task_runs.append(task_run)
+        if feedback_text and task_run.id:
+            self._pending_feedback[task_run.id] = feedback_text
+
+    def save_pending_feedback(self, task_run: TaskRun) -> None:
+        """Create Feedback children for a saved TaskRun if it has pending feedback."""
+        if not task_run.id:
+            return
+        feedback_text = self._pending_feedback.get(task_run.id)
+        if feedback_text:
+            fb = Feedback(
+                feedback=feedback_text,
+                source=FeedbackSource.spec_feedback,
+                parent=task_run,
+            )
+            fb.save_to_file()


 def create_dataset_task_runs(

@@ -218,17 +250,18 @@ def create_dataset_task_runs(
     train_tag: str,
     golden_tag: str,
     spec_name: str,
-) -> list[TaskRun]:
+) -> DatasetTaskRuns:
     """Create TaskRuns for eval, train, and golden datasets.

     Samples from all_examples (mutating it) and creates TaskRuns for:
     - Eval dataset
     - Train dataset
     - Golden dataset (reviewed examples + unrated examples to reach MIN_GOLDEN_EXAMPLES)

-    Returns TaskRuns without parent set - caller must set parent.
+    Returns DatasetTaskRuns without parent set - caller must set parent and call
+    save_pending_feedback after saving each run.
     """
-    task_runs: list[TaskRun] = []
+    result = DatasetTaskRuns()

     # Generate a session tag for all task runs in this batch
     session_id = random.randint(0, 999999999999)

@@ -237,18 +270,17 @@ def create_dataset_task_runs(

     # Create TaskRuns for reviewed examples with ratings
     for reviewed in reviewed_examples:
-        task_runs.append(
-            create_task_run_from_reviewed(reviewed, golden_tag, spec_name, extra_tags)
+        task_run, feedback_text = create_task_run_from_reviewed(
+            reviewed, golden_tag, spec_name, extra_tags
         )
+        result.add_run(task_run, feedback_text)

     # Create more unrated golden examples from remaining pool if needed
     unrated_golden_count = max(0, MIN_GOLDEN_EXAMPLES - len(reviewed_examples))
     if unrated_golden_count > 0:
         unrated_golden_examples = sample_and_remove(all_examples, unrated_golden_count)
         for example in unrated_golden_examples:
-            task_runs.append(
-                create_task_run_from_sample(example, golden_tag, extra_tags)
-            )
+            result.add_run(create_task_run_from_sample(example, golden_tag, extra_tags))

     # Sample half the remaining examples for eval vs train datasets
     example_count = len(all_examples)

@@ -259,10 +291,10 @@ def create_dataset_task_runs(

     # Create TaskRuns for eval examples
     for example in eval_examples:
-        task_runs.append(create_task_run_from_sample(example, eval_tag, extra_tags))
+        result.add_run(create_task_run_from_sample(example, eval_tag, extra_tags))

     # Create TaskRuns for train examples
     for example in train_examples:
-        task_runs.append(create_task_run_from_sample(example, train_tag, extra_tags))
+        result.add_run(create_task_run_from_sample(example, train_tag, extra_tags))

-    return task_runs
+    return result
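
Taken together, the tuple return and DatasetTaskRuns give callers a park-then-attach workflow. A hedged usage sketch, assuming a ReviewedExample `example` and a saved parent `task` from the surrounding codebase (the tags and spec name are illustrative):

    task_run, feedback_text = create_task_run_from_reviewed(
        example, "golden_tag", "My Spec", extra_tags=["session_123"]
    )
    runs = DatasetTaskRuns()
    runs.add_run(task_run, feedback_text)  # feedback parked, keyed by task_run.id
    task_run.parent = task
    task_run.save_to_file()                # Feedback children need a saved parent
    runs.save_pending_feedback(task_run)   # writes Feedback(source=spec_feedback)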

app/desktop/studio_server/utils/test_copilot_utils.py
Lines changed: 40 additions & 14 deletions

@@ -145,7 +145,7 @@ def test_creates_task_run_with_correct_input(self):
             user_says_meets_spec=True,
             feedback="Good example",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        task_run, _ = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
         assert task_run.input == "test input"

     def test_creates_task_run_with_correct_output(self):

@@ -156,7 +156,7 @@ def test_creates_task_run_with_correct_output(self):
             user_says_meets_spec=True,
             feedback="",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        task_run, _ = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
         assert task_run.output.output == "test output"

     def test_creates_task_run_with_pass_rating_when_meets_spec(self):

@@ -167,7 +167,7 @@ def test_creates_task_run_with_pass_rating_when_meets_spec(self):
             user_says_meets_spec=True,
             feedback="",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        task_run, _ = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
         rating_key = "named::My Spec"
         assert rating_key in task_run.output.rating.requirement_ratings
         assert task_run.output.rating.requirement_ratings[rating_key].value == 1.0

@@ -180,7 +180,7 @@ def test_creates_task_run_with_fail_rating_when_not_meets_spec(self):
             user_says_meets_spec=False,
             feedback="Bad example",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        task_run, _ = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
         rating_key = "named::My Spec"
         assert rating_key in task_run.output.rating.requirement_ratings
         assert task_run.output.rating.requirement_ratings[rating_key].value == 0.0

@@ -193,7 +193,7 @@ def test_creates_task_run_with_tag(self):
             user_says_meets_spec=True,
             feedback="",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        task_run, _ = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
         assert "golden_tag" in task_run.tags

     def test_creates_task_run_with_extra_tags(self):

@@ -204,7 +204,7 @@ def test_creates_task_run_with_extra_tags(self):
             user_says_meets_spec=True,
             feedback="",
         )
-        task_run = create_task_run_from_reviewed(
+        task_run, _ = create_task_run_from_reviewed(
             example, "golden_tag", "My Spec", extra_tags=["session_456"]
         )
         assert "golden_tag" in task_run.tags

@@ -218,13 +218,39 @@ def test_creates_task_run_with_pass_fail_rating_type(self):
             user_says_meets_spec=True,
             feedback="",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        task_run, _ = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
         rating_key = "named::My Spec"
         assert (
             task_run.output.rating.requirement_ratings[rating_key].type
             == TaskOutputRatingType.pass_fail
         )

+    def test_returns_feedback_text_when_present(self):
+        example = ReviewedExample(
+            input="test input",
+            output="test output",
+            model_says_meets_spec=True,
+            user_says_meets_spec=False,
+            feedback="This fails because the output is too vague",
+        )
+        _, feedback_text = create_task_run_from_reviewed(
+            example, "golden_tag", "My Spec"
+        )
+        assert feedback_text == "This fails because the output is too vague"
+
+    def test_returns_none_feedback_when_empty(self):
+        example = ReviewedExample(
+            input="test input",
+            output="test output",
+            model_says_meets_spec=True,
+            user_says_meets_spec=True,
+            feedback="",
+        )
+        _, feedback_text = create_task_run_from_reviewed(
+            example, "golden_tag", "My Spec"
+        )
+        assert feedback_text is None
+

 class TestCreateDatasetTaskRuns:
     def test_creates_correct_number_of_task_runs(self):

@@ -241,7 +267,7 @@ def test_creates_correct_number_of_task_runs(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         # Should have NUM_SAMPLES_PER_TOPIC * NUM_TOPICS
         expected_count = NUM_SAMPLES_PER_TOPIC * NUM_TOPICS

@@ -269,7 +295,7 @@ def test_includes_reviewed_examples_in_golden_set(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         # Find the reviewed example in task runs
         reviewed_run = next(

@@ -292,7 +318,7 @@ def test_all_task_runs_have_session_tag(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         # All task runs should have a session tag
         for task_run in task_runs:

@@ -315,7 +341,7 @@ def test_same_session_tag_for_all_runs(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         # All task runs should have the same session tag
         session_tags = set()

@@ -340,7 +366,7 @@ def test_eval_examples_have_eval_tag(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         eval_runs = [tr for tr in task_runs if "eval_tag" in tr.tags]
         num_runs = NUM_SAMPLES_PER_TOPIC * NUM_TOPICS

@@ -361,7 +387,7 @@ def test_train_examples_have_train_tag(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         train_runs = [tr for tr in task_runs if "train_tag" in tr.tags]
         num_runs = NUM_SAMPLES_PER_TOPIC * NUM_TOPICS

@@ -383,7 +409,7 @@ def test_handles_insufficient_examples(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         # Should use all available examples
         assert len(task_runs) == 5
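
These tests pin down the new tuple return and the .task_runs accessor; the pending-feedback bookkeeping is only exercised indirectly. A hedged sketch of a direct test, reusing the fixtures above (it peeks at the private _pending_feedback dict and assumes TaskRun ids exist before saving, which add_run's id check suggests):

    def test_add_run_parks_feedback_by_run_id():
        example = ReviewedExample(
            input="test input",
            output="test output",
            model_says_meets_spec=True,
            user_says_meets_spec=False,
            feedback="Too vague",
        )
        task_run, feedback_text = create_task_run_from_reviewed(
            example, "golden_tag", "My Spec"
        )
        result = DatasetTaskRuns()
        result.add_run(task_run, feedback_text)
        assert result.task_runs == [task_run]
        assert result._pending_feedback.get(task_run.id) == "Too vague"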
