Skip to content

Commit 9c20b38

Browse files
authored
Merge pull request #1220 from Kiln-AI/KIL-483/persist-spec-fb
Persist spec judge incorrect user feedback on task run
2 parents ce96dbc + 3c3cf3d commit 9c20b38

File tree

5 files changed

+39
-2
lines changed

5 files changed

+39
-2
lines changed

app/desktop/studio_server/utils/copilot_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ def create_task_run_from_reviewed(
207207
},
208208
),
209209
),
210+
user_feedback=example.feedback if example.feedback else None,
210211
tags=tags,
211212
)
212213

app/desktop/studio_server/utils/test_copilot_utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,28 @@ def test_creates_task_run_with_pass_fail_rating_type(self):
225225
== TaskOutputRatingType.pass_fail
226226
)
227227

228+
def test_creates_task_run_with_user_feedback(self):
229+
example = ReviewedExample(
230+
input="test input",
231+
output="test output",
232+
model_says_meets_spec=True,
233+
user_says_meets_spec=False,
234+
feedback="This fails because the output is too vague",
235+
)
236+
task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
237+
assert task_run.user_feedback == "This fails because the output is too vague"
238+
239+
def test_creates_task_run_with_no_user_feedback_when_empty(self):
240+
example = ReviewedExample(
241+
input="test input",
242+
output="test output",
243+
model_says_meets_spec=True,
244+
user_says_meets_spec=True,
245+
feedback="",
246+
)
247+
task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
248+
assert task_run.user_feedback is None
249+
228250

229251
class TestCreateDatasetTaskRuns:
230252
def test_creates_correct_number_of_task_runs(self):

app/web_ui/src/lib/api_schema.d.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8051,6 +8051,11 @@ export interface components {
80518051
* @description Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.
80528052
*/
80538053
repair_instructions?: string | null;
8054+
/**
8055+
* User Feedback
8056+
* @description User feedback from the spec review process explaining why the output passes or fails a requirement.
8057+
*/
8058+
user_feedback?: string | null;
80548059
/** @description A version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field. */
80558060
repaired_output?: components["schemas"]["TaskOutput-Input"] | null;
80568061
/**
@@ -8128,6 +8133,11 @@ export interface components {
81288133
* @description Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.
81298134
*/
81308135
repair_instructions?: string | null;
8136+
/**
8137+
* User Feedback
8138+
* @description User feedback from the spec review process explaining why the output passes or fails a requirement.
8139+
*/
8140+
user_feedback?: string | null;
81318141
/** @description A version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field. */
81328142
repaired_output?: components["schemas"]["TaskOutput-Output"] | null;
81338143
/**

libs/core/kiln_ai/datamodel/task_run.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ class TaskRun(KilnParentedModel):
9898
default=None,
9999
description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
100100
)
101+
user_feedback: str | None = Field(
102+
default=None,
103+
description="User feedback from the spec review process explaining why the output passes or fails a requirement.",
104+
)
101105
repaired_output: TaskOutput | None = Field(
102106
default=None,
103107
description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",

utils/pre-commit-hook

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Copy this file to .git/hooks/pre-commit to run Kiln checks before each commit
44
#
55

6-
cd "$(dirname "$0")"
6+
cd "$(git rev-parse --show-toplevel)"
77
# --staged-only is useful to only run checks on the files that are staged for commit
8-
uv run ../../checks.sh --staged-only
8+
uv run ./checks.sh --staged-only
99

0 commit comments

Comments
 (0)