Skip to content

Commit d469cec

Browse files
sfierro and claude committed
KIL-534 Add Feedback data model on TaskRun
Replace the single `user_feedback` string field on TaskRun with a proper Feedback model that supports multiple feedback entries per run. Feedback is a parented model under TaskRun, stored as separate files to avoid write conflicts when multiple people provide feedback.

- Add Feedback model (feedback text + FeedbackSource enum)
- Make TaskRun a parent model with feedback children
- Remove user_feedback field from TaskRun
- Add REST API endpoints (list/create) for feedback on task runs
- Update copilot models, utils, and frontend spec builder
- Create follow-up ticket KIL-537 for repair UI replacement

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e682c0e commit d469cec

File tree

15 files changed

+610
-59
lines changed

15 files changed

+610
-59
lines changed

app/desktop/studio_server/api_client/kiln_ai_server_client/models/examples_with_feedback_item.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,14 @@ class ExamplesWithFeedbackItem:
1818
input_ (str):
1919
output (str):
2020
fails_specification (bool): Judge's verdict - whether the output fails the Target Specification
21-
user_feedback (None | str | Unset): Optional text feedback from the user
21+
feedback (None | str | Unset): Optional text feedback from the user
2222
"""
2323

2424
user_agrees_with_judge: bool
2525
input_: str
2626
output: str
2727
fails_specification: bool
28-
user_feedback: None | str | Unset = UNSET
28+
feedback: None | str | Unset = UNSET
2929

3030
def to_dict(self) -> dict[str, Any]:
3131
user_agrees_with_judge = self.user_agrees_with_judge
@@ -36,11 +36,11 @@ def to_dict(self) -> dict[str, Any]:
3636

3737
fails_specification = self.fails_specification
3838

39-
user_feedback: None | str | Unset
40-
if isinstance(self.user_feedback, Unset):
41-
user_feedback = UNSET
39+
feedback: None | str | Unset
40+
if isinstance(self.feedback, Unset):
41+
feedback = UNSET
4242
else:
43-
user_feedback = self.user_feedback
43+
feedback = self.feedback
4444

4545
field_dict: dict[str, Any] = {}
4646

@@ -52,8 +52,8 @@ def to_dict(self) -> dict[str, Any]:
5252
"fails_specification": fails_specification,
5353
}
5454
)
55-
if user_feedback is not UNSET:
56-
field_dict["user_feedback"] = user_feedback
55+
if feedback is not UNSET:
56+
field_dict["feedback"] = feedback
5757

5858
return field_dict
5959

@@ -68,21 +68,21 @@ def from_dict(cls: type[T], src_dict: Mapping[str, Any]) -> T:
6868

6969
fails_specification = d.pop("fails_specification")
7070

71-
def _parse_user_feedback(data: object) -> None | str | Unset:
71+
def _parse_feedback(data: object) -> None | str | Unset:
7272
if data is None:
7373
return data
7474
if isinstance(data, Unset):
7575
return data
7676
return cast(None | str | Unset, data)
7777

78-
user_feedback = _parse_user_feedback(d.pop("user_feedback", UNSET))
78+
feedback = _parse_feedback(d.pop("feedback", UNSET))
7979

8080
examples_with_feedback_item = cls(
8181
user_agrees_with_judge=user_agrees_with_judge,
8282
input_=input_,
8383
output=output,
8484
fails_specification=fails_specification,
85-
user_feedback=user_feedback,
85+
feedback=feedback,
8686
)
8787

8888
return examples_with_feedback_item

app/desktop/studio_server/api_models/copilot_models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ class ExampleWithFeedbackApi(BaseModel):
7979
input: str = Field(alias="input")
8080
output: str
8181
fails_specification: bool
82-
user_feedback: str | None = None
82+
feedback: str | None = None
8383

8484

8585
class ClarifySpecApiInput(BaseModel):

app/desktop/studio_server/api_models/test_copilot_models.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -153,24 +153,24 @@ def test_creates_with_required_fields(self):
153153
assert example.output == "test output"
154154
assert example.fails_specification is False
155155

156-
def test_user_feedback_optional(self):
156+
def test_feedback_optional(self):
157157
example = ExampleWithFeedbackApi(
158158
user_agrees_with_judge=True,
159159
input="test input",
160160
output="test output",
161161
fails_specification=False,
162162
)
163-
assert example.user_feedback is None
163+
assert example.feedback is None
164164

165-
def test_user_feedback_can_be_set(self):
165+
def test_feedback_can_be_set(self):
166166
example = ExampleWithFeedbackApi(
167167
user_agrees_with_judge=False,
168168
input="test input",
169169
output="test output",
170170
fails_specification=True,
171-
user_feedback="This is wrong because...",
171+
feedback="This is wrong because...",
172172
)
173-
assert example.user_feedback == "This is wrong because..."
173+
assert example.feedback == "This is wrong because..."
174174

175175

176176
class TestClarifySpecApiInput:

app/desktop/studio_server/utils/copilot_utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,6 @@ def create_task_run_from_reviewed(
207207
},
208208
),
209209
),
210-
user_feedback=example.feedback if example.feedback else None,
211210
tags=tags,
212211
)
213212

app/desktop/studio_server/utils/test_copilot_utils.py

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -225,28 +225,6 @@ def test_creates_task_run_with_pass_fail_rating_type(self):
225225
== TaskOutputRatingType.pass_fail
226226
)
227227

228-
def test_creates_task_run_with_user_feedback(self):
229-
example = ReviewedExample(
230-
input="test input",
231-
output="test output",
232-
model_says_meets_spec=True,
233-
user_says_meets_spec=False,
234-
feedback="This fails because the output is too vague",
235-
)
236-
task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
237-
assert task_run.user_feedback == "This fails because the output is too vague"
238-
239-
def test_creates_task_run_with_no_user_feedback_when_empty(self):
240-
example = ReviewedExample(
241-
input="test input",
242-
output="test output",
243-
model_says_meets_spec=True,
244-
user_says_meets_spec=True,
245-
feedback="",
246-
)
247-
task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
248-
assert task_run.user_feedback is None
249-
250228

251229
class TestCreateDatasetTaskRuns:
252230
def test_creates_correct_number_of_task_runs(self):

app/web_ui/src/lib/api_schema.d.ts

Lines changed: 165 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,24 @@ export interface paths {
378378
patch?: never;
379379
trace?: never;
380380
};
381+
"/api/projects/{project_id}/tasks/{task_id}/runs/{run_id}/feedback": {
382+
parameters: {
383+
query?: never;
384+
header?: never;
385+
path?: never;
386+
cookie?: never;
387+
};
388+
/** List Feedback */
389+
get: operations["list_feedback_api_projects__project_id__tasks__task_id__runs__run_id__feedback_get"];
390+
put?: never;
391+
/** Create Feedback */
392+
post: operations["create_feedback_api_projects__project_id__tasks__task_id__runs__run_id__feedback_post"];
393+
delete?: never;
394+
options?: never;
395+
head?: never;
396+
patch?: never;
397+
trace?: never;
398+
};
381399
"/api/projects/{project_id}/documents/bulk": {
382400
parameters: {
383401
query?: never;
@@ -3385,6 +3403,19 @@ export interface components {
33853403
/** @description The properties of the extractor config, specific to the selected extractor_type. */
33863404
properties: components["schemas"]["LitellmExtractorConfigProperties"];
33873405
};
3406+
/**
3407+
* CreateFeedbackRequest
3408+
* @description Request body for creating feedback on a task run.
3409+
*/
3410+
CreateFeedbackRequest: {
3411+
/**
3412+
* Feedback
3413+
* @description Free-form text feedback on the task run.
3414+
*/
3415+
feedback: string;
3416+
/** @description Where this feedback originated. */
3417+
source: components["schemas"]["FeedbackSource"];
3418+
};
33883419
/**
33893420
* CreateFinetuneRequest
33903421
* @description Request to create a finetune
@@ -4630,8 +4661,8 @@ export interface components {
46304661
output: string;
46314662
/** Fails Specification */
46324663
fails_specification: boolean;
4633-
/** User Feedback */
4634-
user_feedback?: string | null;
4664+
/** Feedback */
4665+
feedback?: string | null;
46354666
};
46364667
/**
46374668
* ExternalToolApiDescription
@@ -4950,6 +4981,62 @@ export interface components {
49504981
/** Factually Inaccurate Examples */
49514982
factually_inaccurate_examples: string;
49524983
};
4984+
/**
4985+
* Feedback
4986+
* @description Feedback on a task run.
4987+
*
4988+
* Supports multi-source feedback: different users, automated systems, and
4989+
* different locations in the UI can each contribute independent feedback
4990+
* entries on the same task run.
4991+
*/
4992+
Feedback: {
4993+
/**
4994+
* V
4995+
* @description Schema version for migration support.
4996+
* @default 1
4997+
*/
4998+
v: number;
4999+
/**
5000+
* Id
5001+
* @description Unique identifier for this record.
5002+
*/
5003+
id?: string | null;
5004+
/**
5005+
* Path
5006+
* @description File system path where the record is stored.
5007+
*/
5008+
path?: string | null;
5009+
/**
5010+
* Created At
5011+
* Format: date-time
5012+
* @description Timestamp when the model was created.
5013+
*/
5014+
created_at?: string;
5015+
/**
5016+
* Created By
5017+
* @description User ID of the creator.
5018+
*/
5019+
created_by?: string;
5020+
/**
5021+
* Feedback
5022+
* @description Free-form text feedback on the task run.
5023+
*/
5024+
feedback: string;
5025+
/** @description Where this feedback originated, e.g. 'run-page' or 'spec-feedback'. */
5026+
source: components["schemas"]["FeedbackSource"];
5027+
/** Model Type */
5028+
readonly model_type: string;
5029+
};
5030+
/**
5031+
* FeedbackSource
5032+
* @description Where a piece of feedback originated.
5033+
*
5034+
* This is an append-only enum: new sources can be added freely, but existing
5035+
* values must never be removed or renamed so that older persisted data
5036+
* continues to load.
5037+
* @enum {string}
5038+
*/
5039+
FeedbackSource: "run-page" | "spec-feedback";
49535040
/**
49545041
* FewShotExample
49555042
* @description An input/output example for few-shot prompting.
@@ -8055,11 +8142,6 @@ export interface components {
80558142
* @description Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.
80568143
*/
80578144
repair_instructions?: string | null;
8058-
/**
8059-
* User Feedback
8060-
* @description User feedback from the spec review process explaining why the output passes or fails a requirement.
8061-
*/
8062-
user_feedback?: string | null;
80638145
/** @description An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field. */
80648146
repaired_output?: components["schemas"]["TaskOutput-Input"] | null;
80658147
/**
@@ -8137,11 +8219,6 @@ export interface components {
81378219
* @description Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.
81388220
*/
81398221
repair_instructions?: string | null;
8140-
/**
8141-
* User Feedback
8142-
* @description User feedback from the spec review process explaining why the output passes or fails a requirement.
8143-
*/
8144-
user_feedback?: string | null;
81458222
/** @description An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field. */
81468223
repaired_output?: components["schemas"]["TaskOutput-Output"] | null;
81478224
/**
@@ -9813,6 +9890,82 @@ export interface operations {
98139890
};
98149891
};
98159892
};
9893+
list_feedback_api_projects__project_id__tasks__task_id__runs__run_id__feedback_get: {
9894+
parameters: {
9895+
query?: never;
9896+
header?: never;
9897+
path: {
9898+
/** @description The unique identifier of the project. */
9899+
project_id: string;
9900+
/** @description The unique identifier of the task within the project. */
9901+
task_id: string;
9902+
/** @description The unique identifier of the task run. */
9903+
run_id: string;
9904+
};
9905+
cookie?: never;
9906+
};
9907+
requestBody?: never;
9908+
responses: {
9909+
/** @description Successful Response */
9910+
200: {
9911+
headers: {
9912+
[name: string]: unknown;
9913+
};
9914+
content: {
9915+
"application/json": components["schemas"]["Feedback"][];
9916+
};
9917+
};
9918+
/** @description Validation Error */
9919+
422: {
9920+
headers: {
9921+
[name: string]: unknown;
9922+
};
9923+
content: {
9924+
"application/json": components["schemas"]["HTTPValidationError"];
9925+
};
9926+
};
9927+
};
9928+
};
9929+
create_feedback_api_projects__project_id__tasks__task_id__runs__run_id__feedback_post: {
9930+
parameters: {
9931+
query?: never;
9932+
header?: never;
9933+
path: {
9934+
/** @description The unique identifier of the project. */
9935+
project_id: string;
9936+
/** @description The unique identifier of the task within the project. */
9937+
task_id: string;
9938+
/** @description The unique identifier of the task run. */
9939+
run_id: string;
9940+
};
9941+
cookie?: never;
9942+
};
9943+
requestBody: {
9944+
content: {
9945+
"application/json": components["schemas"]["CreateFeedbackRequest"];
9946+
};
9947+
};
9948+
responses: {
9949+
/** @description Successful Response */
9950+
200: {
9951+
headers: {
9952+
[name: string]: unknown;
9953+
};
9954+
content: {
9955+
"application/json": components["schemas"]["Feedback"];
9956+
};
9957+
};
9958+
/** @description Validation Error */
9959+
422: {
9960+
headers: {
9961+
[name: string]: unknown;
9962+
};
9963+
content: {
9964+
"application/json": components["schemas"]["HTTPValidationError"];
9965+
};
9966+
};
9967+
};
9968+
};
98169969
create_documents_bulk_api_projects__project_id__documents_bulk_post: {
98179970
parameters: {
98189971
query?: never;

app/web_ui/src/routes/(app)/specs/[project_id]/[task_id]/spec_builder/+page.svelte

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -547,7 +547,7 @@
547547
const examples_with_feedback = currentExamples.map((example) => ({
548548
user_agrees_with_judge:
549549
example.model_says_meets_spec === example.user_says_meets_spec,
550-
user_feedback: example.feedback,
550+
feedback: example.feedback,
551551
input: example.input,
552552
output: example.output,
553553
fails_specification: !example.user_says_meets_spec,

0 commit comments

Comments
 (0)