
Commit 3e98d91

Merge branch 'main' into mike/update-claude-maintain
2 parents 8a3204d + a358e73

33 files changed: +1407 -641 lines

app/desktop/studio_server/copilot_api.py

Lines changed: 3 additions & 1 deletion
@@ -370,14 +370,15 @@ async def create_spec_with_copilot(
     )

     # 4. Create TaskRuns for eval, train, and golden datasets
-    task_runs = create_dataset_task_runs(
+    dataset_runs = create_dataset_task_runs(
         all_examples=all_examples,
         reviewed_examples=request.reviewed_examples,
         eval_tag=eval_tag,
         train_tag=train_tag,
         golden_tag=golden_tag,
         spec_name=request.name,
     )
+    task_runs = dataset_runs.task_runs
     for run in task_runs:
         run.parent = task
     models_to_save.extend(task_runs)
@@ -430,6 +431,7 @@ async def create_spec_with_copilot(
     for run in task_runs:
         run.save_to_file()
         saved_models.append(run)
+        dataset_runs.save_pending_feedback(run)

     spec.save_to_file()
     saved_models.append(spec)
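
Why this shape: create_dataset_task_runs now returns a DatasetTaskRuns wrapper rather than a bare list, and per-example feedback is written only after each run is persisted. Below is a minimal sketch of the resulting flow, assuming (as the hunks suggest) that task, models_to_save, and saved_models are defined earlier in create_spec_with_copilot; it restates the diff, not code beyond it:

    # Sketch of the new call pattern; surrounding names assumed from the route.
    dataset_runs = create_dataset_task_runs(
        all_examples=all_examples,
        reviewed_examples=request.reviewed_examples,
        eval_tag=eval_tag,
        train_tag=train_tag,
        golden_tag=golden_tag,
        spec_name=request.name,
    )
    task_runs = dataset_runs.task_runs
    for run in task_runs:
        run.parent = task                 # parent is set before any save
    models_to_save.extend(task_runs)

    # ...later, in the save loop:
    for run in task_runs:
        run.save_to_file()                # the run now exists on disk
        saved_models.append(run)
        dataset_runs.save_pending_feedback(run)  # Feedback child created post-save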

app/desktop/studio_server/test_copilot_api.py

Lines changed: 2 additions & 1 deletion
@@ -11,6 +11,7 @@
     RefineSpecApiOutput,
 )
 from app.desktop.studio_server.copilot_api import connect_copilot_api
+from app.desktop.studio_server.utils.copilot_utils import DatasetTaskRuns
 from fastapi import FastAPI
 from fastapi.testclient import TestClient
 from kiln_ai.datamodel import Project, Task
@@ -417,7 +418,7 @@ def test_create_spec_with_copilot_success(
         ),
         patch(
             "app.desktop.studio_server.copilot_api.create_dataset_task_runs",
-            return_value=[],
+            return_value=DatasetTaskRuns(),
         ),
         patch(
             "app.desktop.studio_server.copilot_api.generate_memorable_name",

app/desktop/studio_server/utils/copilot_utils.py

Lines changed: 47 additions & 16 deletions
@@ -25,7 +25,7 @@
 )
 from app.desktop.studio_server.utils.response_utils import unwrap_response
 from fastapi import HTTPException
-from kiln_ai.datamodel import TaskRun
+from kiln_ai.datamodel import Feedback, FeedbackSource, TaskRun
 from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
 from kiln_ai.datamodel.task_output import (
     DataSource,
@@ -172,8 +172,12 @@ def create_task_run_from_reviewed(
     tag: str,
     spec_name: str,
     extra_tags: list[str] | None = None,
-) -> TaskRun:
-    """Create a TaskRun from a reviewed example with rating (without parent set)."""
+) -> tuple[TaskRun, str | None]:
+    """Create a TaskRun from a reviewed example with rating (without parent set).
+
+    Returns a (TaskRun, feedback_text) tuple. The caller should create a Feedback
+    child on the TaskRun after saving it, if feedback_text is not None.
+    """
     data_source = DataSource(
         type=DataSourceType.synthetic,
         properties={
@@ -190,7 +194,7 @@ def create_task_run_from_reviewed(
     rating_key = f"named::{spec_name}"
     rating_value = 1.0 if example.user_says_meets_spec else 0.0

-    return TaskRun(
+    task_run = TaskRun(
         input=example.input,
         input_source=data_source,
         output=TaskOutput(
@@ -207,9 +211,36 @@ def create_task_run_from_reviewed(
                 },
             ),
         ),
-        user_feedback=example.feedback if example.feedback else None,
         tags=tags,
     )
+    feedback_text = example.feedback if example.feedback else None
+    return task_run, feedback_text
+
+
+class DatasetTaskRuns:
+    """Result of creating dataset task runs, with pending feedback to attach after saving."""
+
+    def __init__(self) -> None:
+        self.task_runs: list[TaskRun] = []
+        self._pending_feedback: dict[str, str] = {}
+
+    def add_run(self, task_run: TaskRun, feedback_text: str | None = None) -> None:
+        self.task_runs.append(task_run)
+        if feedback_text and task_run.id:
+            self._pending_feedback[task_run.id] = feedback_text
+
+    def save_pending_feedback(self, task_run: TaskRun) -> None:
+        """Create Feedback children for a saved TaskRun if it has pending feedback."""
+        if not task_run.id:
+            return
+        feedback_text = self._pending_feedback.get(task_run.id)
+        if feedback_text:
+            fb = Feedback(
+                feedback=feedback_text,
+                source=FeedbackSource.spec_feedback,
+                parent=task_run,
+            )
+            fb.save_to_file()


 def create_dataset_task_runs(
@@ -219,17 +250,18 @@ def create_dataset_task_runs(
     train_tag: str,
     golden_tag: str,
     spec_name: str,
-) -> list[TaskRun]:
+) -> DatasetTaskRuns:
     """Create TaskRuns for eval, train, and golden datasets.

     Samples from all_examples (mutating it) and creates TaskRuns for:
     - Eval dataset
     - Train dataset
     - Golden dataset (reviewed examples + unrated examples to reach MIN_GOLDEN_EXAMPLES)

-    Returns TaskRuns without parent set - caller must set parent.
+    Returns DatasetTaskRuns without parent set - caller must set parent and call
+    save_pending_feedback after saving each run.
     """
-    task_runs: list[TaskRun] = []
+    result = DatasetTaskRuns()

     # Generate a session tag for all task runs in this batch
     session_id = random.randint(0, 999999999999)
@@ -238,18 +270,17 @@ def create_dataset_task_runs(

     # Create TaskRuns for reviewed examples with ratings
     for reviewed in reviewed_examples:
-        task_runs.append(
-            create_task_run_from_reviewed(reviewed, golden_tag, spec_name, extra_tags)
+        task_run, feedback_text = create_task_run_from_reviewed(
+            reviewed, golden_tag, spec_name, extra_tags
         )
+        result.add_run(task_run, feedback_text)

     # Create more unrated golden examples from remaining pool if needed
     unrated_golden_count = max(0, MIN_GOLDEN_EXAMPLES - len(reviewed_examples))
     if unrated_golden_count > 0:
         unrated_golden_examples = sample_and_remove(all_examples, unrated_golden_count)
         for example in unrated_golden_examples:
-            task_runs.append(
-                create_task_run_from_sample(example, golden_tag, extra_tags)
-            )
+            result.add_run(create_task_run_from_sample(example, golden_tag, extra_tags))

     # Sample half the remaining examples for eval vs train datasets
     example_count = len(all_examples)
@@ -260,10 +291,10 @@ def create_dataset_task_runs(

     # Create TaskRuns for eval examples
     for example in eval_examples:
-        task_runs.append(create_task_run_from_sample(example, eval_tag, extra_tags))
+        result.add_run(create_task_run_from_sample(example, eval_tag, extra_tags))

     # Create TaskRuns for train examples
     for example in train_examples:
-        task_runs.append(create_task_run_from_sample(example, train_tag, extra_tags))
+        result.add_run(create_task_run_from_sample(example, train_tag, extra_tags))

-    return task_runs
+    return result
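
The new class implements a small deferred-attachment pattern: feedback text is queued by TaskRun id while runs are built, and only turned into a Feedback child once the parent run has been saved (the kiln_ai datamodel appears to require a saved parent before the child is written; that reading is inferred from this diff, not confirmed here). A self-contained sketch of the same pattern, with plain dataclasses standing in for the kiln_ai types and all names illustrative:

    from dataclasses import dataclass, field

    @dataclass
    class Run:
        id: str | None = None
        saved: bool = False

        def save(self) -> None:
            self.saved = True  # stands in for save_to_file()

    @dataclass
    class PendingChildren:
        runs: list[Run] = field(default_factory=list)
        _pending: dict[str, str] = field(default_factory=dict)

        def add_run(self, run: Run, note: str | None = None) -> None:
            self.runs.append(run)
            if note and run.id:
                self._pending[run.id] = note  # queue by id, attach later

        def attach_pending(self, run: Run) -> None:
            # Only attach once the parent has actually been saved.
            if run.id and run.saved and run.id in self._pending:
                print(f"attach {self._pending.pop(run.id)!r} to {run.id}")

    result = PendingChildren()
    result.add_run(Run(id="r1"), "output too vague")
    result.add_run(Run(id="r2"))      # no feedback queued for this one
    for run in result.runs:
        run.save()
        result.attach_pending(run)    # fires only for r1

Queuing plain text by id, rather than constructing the Feedback up front, keeps the pending data inert until the parent exists, which is why the route calls save_pending_feedback only inside its save loop.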

app/desktop/studio_server/utils/test_copilot_utils.py

Lines changed: 24 additions & 20 deletions
@@ -145,7 +145,7 @@ def test_creates_task_run_with_correct_input(self):
             user_says_meets_spec=True,
             feedback="Good example",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        task_run, _ = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
         assert task_run.input == "test input"

     def test_creates_task_run_with_correct_output(self):
@@ -156,7 +156,7 @@ def test_creates_task_run_with_correct_output(self):
             user_says_meets_spec=True,
             feedback="",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        task_run, _ = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
         assert task_run.output.output == "test output"

     def test_creates_task_run_with_pass_rating_when_meets_spec(self):
@@ -167,7 +167,7 @@ def test_creates_task_run_with_pass_rating_when_meets_spec(self):
             user_says_meets_spec=True,
             feedback="",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        task_run, _ = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
         rating_key = "named::My Spec"
         assert rating_key in task_run.output.rating.requirement_ratings
         assert task_run.output.rating.requirement_ratings[rating_key].value == 1.0
@@ -180,7 +180,7 @@ def test_creates_task_run_with_fail_rating_when_not_meets_spec(self):
             user_says_meets_spec=False,
             feedback="Bad example",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        task_run, _ = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
         rating_key = "named::My Spec"
         assert rating_key in task_run.output.rating.requirement_ratings
         assert task_run.output.rating.requirement_ratings[rating_key].value == 0.0
@@ -193,7 +193,7 @@ def test_creates_task_run_with_tag(self):
             user_says_meets_spec=True,
             feedback="",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        task_run, _ = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
         assert "golden_tag" in task_run.tags

     def test_creates_task_run_with_extra_tags(self):
@@ -204,7 +204,7 @@ def test_creates_task_run_with_extra_tags(self):
             user_says_meets_spec=True,
             feedback="",
         )
-        task_run = create_task_run_from_reviewed(
+        task_run, _ = create_task_run_from_reviewed(
             example, "golden_tag", "My Spec", extra_tags=["session_456"]
         )
         assert "golden_tag" in task_run.tags
@@ -218,34 +218,38 @@ def test_creates_task_run_with_pass_fail_rating_type(self):
             user_says_meets_spec=True,
             feedback="",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        task_run, _ = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
         rating_key = "named::My Spec"
         assert (
             task_run.output.rating.requirement_ratings[rating_key].type
             == TaskOutputRatingType.pass_fail
         )

-    def test_creates_task_run_with_user_feedback(self):
+    def test_returns_feedback_text_when_present(self):
         example = ReviewedExample(
             input="test input",
             output="test output",
             model_says_meets_spec=True,
             user_says_meets_spec=False,
             feedback="This fails because the output is too vague",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
-        assert task_run.user_feedback == "This fails because the output is too vague"
+        _, feedback_text = create_task_run_from_reviewed(
+            example, "golden_tag", "My Spec"
+        )
+        assert feedback_text == "This fails because the output is too vague"

-    def test_creates_task_run_with_no_user_feedback_when_empty(self):
+    def test_returns_none_feedback_when_empty(self):
         example = ReviewedExample(
             input="test input",
             output="test output",
             model_says_meets_spec=True,
             user_says_meets_spec=True,
             feedback="",
         )
-        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
-        assert task_run.user_feedback is None
+        _, feedback_text = create_task_run_from_reviewed(
+            example, "golden_tag", "My Spec"
+        )
+        assert feedback_text is None


 class TestCreateDatasetTaskRuns:
@@ -263,7 +267,7 @@ def test_creates_correct_number_of_task_runs(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         # Should have NUM_SAMPLES_PER_TOPIC * NUM_TOPICS
         expected_count = NUM_SAMPLES_PER_TOPIC * NUM_TOPICS
@@ -291,7 +295,7 @@ def test_includes_reviewed_examples_in_golden_set(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         # Find the reviewed example in task runs
         reviewed_run = next(
@@ -314,7 +318,7 @@ def test_all_task_runs_have_session_tag(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         # All task runs should have a session tag
         for task_run in task_runs:
@@ -337,7 +341,7 @@ def test_same_session_tag_for_all_runs(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         # All task runs should have the same session tag
         session_tags = set()
@@ -362,7 +366,7 @@ def test_eval_examples_have_eval_tag(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         eval_runs = [tr for tr in task_runs if "eval_tag" in tr.tags]
         num_runs = NUM_SAMPLES_PER_TOPIC * NUM_TOPICS
@@ -383,7 +387,7 @@ def test_train_examples_have_train_tag(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         train_runs = [tr for tr in task_runs if "train_tag" in tr.tags]
         num_runs = NUM_SAMPLES_PER_TOPIC * NUM_TOPICS
@@ -405,7 +409,7 @@ def test_handles_insufficient_examples(self):
             "train_tag",
             "golden_tag",
             "Test Spec",
-        )
+        ).task_runs

         # Should use all available examples
         assert len(task_runs) == 5
