Merge pull request #1220 from Kiln-AI/KIL-483/persist-spec-fb

sfierro · web-flow · commit 9c20b3855ef8 · 2026-04-06T01:34:48.000-07:00
Persist spec judge incorrect user feedback on task run
diff --git a/app/desktop/studio_server/utils/copilot_utils.py b/app/desktop/studio_server/utils/copilot_utils.py
@@ -207,6 +207,7 @@ def create_task_run_from_reviewed(
                 },
             ),
         ),
+        user_feedback=example.feedback if example.feedback else None,
         tags=tags,
     )
 
diff --git a/app/desktop/studio_server/utils/test_copilot_utils.py b/app/desktop/studio_server/utils/test_copilot_utils.py
@@ -225,6 +225,28 @@ def test_creates_task_run_with_pass_fail_rating_type(self):
             == TaskOutputRatingType.pass_fail
         )
 
+    def test_creates_task_run_with_user_feedback(self):
+        example = ReviewedExample(
+            input="test input",
+            output="test output",
+            model_says_meets_spec=True,
+            user_says_meets_spec=False,
+            feedback="This fails because the output is too vague",
+        )
+        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        assert task_run.user_feedback == "This fails because the output is too vague"
+
+    def test_creates_task_run_with_no_user_feedback_when_empty(self):
+        example = ReviewedExample(
+            input="test input",
+            output="test output",
+            model_says_meets_spec=True,
+            user_says_meets_spec=True,
+            feedback="",
+        )
+        task_run = create_task_run_from_reviewed(example, "golden_tag", "My Spec")
+        assert task_run.user_feedback is None
+
 
 class TestCreateDatasetTaskRuns:
     def test_creates_correct_number_of_task_runs(self):
diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts
@@ -8051,6 +8051,11 @@ export interface components {
              * @description Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.
              */
             repair_instructions?: string | null;
+            /**
+             * User Feedback
+             * @description User feedback from the spec review process explaining why the output passes or fails a requirement.
+             */
+            user_feedback?: string | null;
             /** @description An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field. */
             repaired_output?: components["schemas"]["TaskOutput-Input"] | null;
             /**
@@ -8128,6 +8133,11 @@ export interface components {
              * @description Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.
              */
             repair_instructions?: string | null;
+            /**
+             * User Feedback
+             * @description User feedback from the spec review process explaining why the output passes or fails a requirement.
+             */
+            user_feedback?: string | null;
             /** @description An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field. */
             repaired_output?: components["schemas"]["TaskOutput-Output"] | null;
             /**
diff --git a/libs/core/kiln_ai/datamodel/task_run.py b/libs/core/kiln_ai/datamodel/task_run.py
@@ -98,6 +98,10 @@ class TaskRun(KilnParentedModel):
         default=None,
         description="Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models.",
     )
+    user_feedback: str | None = Field(
+        default=None,
+        description="User feedback from the spec review process explaining why the output passes or fails a requirement.",
+    )
     repaired_output: TaskOutput | None = Field(
         default=None,
         description="An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field.",
diff --git a/utils/pre-commit-hook b/utils/pre-commit-hook
@@ -3,7 +3,7 @@
 # Copy this file to .git/hooks/pre-commit to run Kiln checks before each commit
 #
 
-cd "$(dirname "$0")"
+cd "$(git rev-parse --show-toplevel)"
 # --staged-only is useful to only run checks on the files that are staged for commit
-uv run ../../checks.sh --staged-only
+uv run ./checks.sh --staged-only
 

Original file line number	Diff line number	Diff line change
`@@ -207,6 +207,7 @@ def create_task_run_from_reviewed(`
`207`	`207`	`},`
`208`	`208`	`),`
`209`	`209`	`),`
	`210`	`+ user_feedback=example.feedback if example.feedback else None,`
`210`	`211`	`tags=tags,`
`211`	`212`	`)`
`212`	`213`
Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,7 @@`
`3`	`3`	`# Copy this file to .git/hooks/pre-commit to run Kiln checks before each commit`
`4`	`4`	`#`
`5`	`5`
`6`		`-cd "$(dirname "$0")"`
	`6`	`+cd "$(git rev-parse --show-toplevel)"`
`7`	`7`	`# --staged-only is useful to only run checks on the files that are staged for commit`
`8`		`-uv run ../../checks.sh --staged-only`
	`8`	`+uv run ./checks.sh --staged-only`
`9`	`9`