@@ -145,7 +145,7 @@ def test_creates_task_run_with_correct_input(self):
145145 user_says_meets_spec = True ,
146146 feedback = "Good example" ,
147147 )
148- task_run = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
148+ task_run , _ = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
149149 assert task_run .input == "test input"
150150
151151 def test_creates_task_run_with_correct_output (self ):
@@ -156,7 +156,7 @@ def test_creates_task_run_with_correct_output(self):
156156 user_says_meets_spec = True ,
157157 feedback = "" ,
158158 )
159- task_run = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
159+ task_run , _ = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
160160 assert task_run .output .output == "test output"
161161
162162 def test_creates_task_run_with_pass_rating_when_meets_spec (self ):
@@ -167,7 +167,7 @@ def test_creates_task_run_with_pass_rating_when_meets_spec(self):
167167 user_says_meets_spec = True ,
168168 feedback = "" ,
169169 )
170- task_run = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
170+ task_run , _ = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
171171 rating_key = "named::My Spec"
172172 assert rating_key in task_run .output .rating .requirement_ratings
173173 assert task_run .output .rating .requirement_ratings [rating_key ].value == 1.0
@@ -180,7 +180,7 @@ def test_creates_task_run_with_fail_rating_when_not_meets_spec(self):
180180 user_says_meets_spec = False ,
181181 feedback = "Bad example" ,
182182 )
183- task_run = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
183+ task_run , _ = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
184184 rating_key = "named::My Spec"
185185 assert rating_key in task_run .output .rating .requirement_ratings
186186 assert task_run .output .rating .requirement_ratings [rating_key ].value == 0.0
@@ -193,7 +193,7 @@ def test_creates_task_run_with_tag(self):
193193 user_says_meets_spec = True ,
194194 feedback = "" ,
195195 )
196- task_run = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
196+ task_run , _ = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
197197 assert "golden_tag" in task_run .tags
198198
199199 def test_creates_task_run_with_extra_tags (self ):
@@ -204,7 +204,7 @@ def test_creates_task_run_with_extra_tags(self):
204204 user_says_meets_spec = True ,
205205 feedback = "" ,
206206 )
207- task_run = create_task_run_from_reviewed (
207+ task_run , _ = create_task_run_from_reviewed (
208208 example , "golden_tag" , "My Spec" , extra_tags = ["session_456" ]
209209 )
210210 assert "golden_tag" in task_run .tags
@@ -218,34 +218,38 @@ def test_creates_task_run_with_pass_fail_rating_type(self):
218218 user_says_meets_spec = True ,
219219 feedback = "" ,
220220 )
221- task_run = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
221+ task_run , _ = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
222222 rating_key = "named::My Spec"
223223 assert (
224224 task_run .output .rating .requirement_ratings [rating_key ].type
225225 == TaskOutputRatingType .pass_fail
226226 )
227227
228- def test_creates_task_run_with_user_feedback (self ):
228+ def test_returns_feedback_text_when_present (self ):
229229 example = ReviewedExample (
230230 input = "test input" ,
231231 output = "test output" ,
232232 model_says_meets_spec = True ,
233233 user_says_meets_spec = False ,
234234 feedback = "This fails because the output is too vague" ,
235235 )
236- task_run = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
237- assert task_run .user_feedback == "This fails because the output is too vague"
236+ _ , feedback_text = create_task_run_from_reviewed (
237+ example , "golden_tag" , "My Spec"
238+ )
239+ assert feedback_text == "This fails because the output is too vague"
238240
239- def test_creates_task_run_with_no_user_feedback_when_empty (self ):
241+ def test_returns_none_feedback_when_empty (self ):
240242 example = ReviewedExample (
241243 input = "test input" ,
242244 output = "test output" ,
243245 model_says_meets_spec = True ,
244246 user_says_meets_spec = True ,
245247 feedback = "" ,
246248 )
247- task_run = create_task_run_from_reviewed (example , "golden_tag" , "My Spec" )
248- assert task_run .user_feedback is None
249+ _ , feedback_text = create_task_run_from_reviewed (
250+ example , "golden_tag" , "My Spec"
251+ )
252+ assert feedback_text is None
249253
250254
251255class TestCreateDatasetTaskRuns :
@@ -263,7 +267,7 @@ def test_creates_correct_number_of_task_runs(self):
263267 "train_tag" ,
264268 "golden_tag" ,
265269 "Test Spec" ,
266- )
270+ ). task_runs
267271
268272 # Should have NUM_SAMPLES_PER_TOPIC * NUM_TOPICS
269273 expected_count = NUM_SAMPLES_PER_TOPIC * NUM_TOPICS
@@ -291,7 +295,7 @@ def test_includes_reviewed_examples_in_golden_set(self):
291295 "train_tag" ,
292296 "golden_tag" ,
293297 "Test Spec" ,
294- )
298+ ). task_runs
295299
296300 # Find the reviewed example in task runs
297301 reviewed_run = next (
@@ -314,7 +318,7 @@ def test_all_task_runs_have_session_tag(self):
314318 "train_tag" ,
315319 "golden_tag" ,
316320 "Test Spec" ,
317- )
321+ ). task_runs
318322
319323 # All task runs should have a session tag
320324 for task_run in task_runs :
@@ -337,7 +341,7 @@ def test_same_session_tag_for_all_runs(self):
337341 "train_tag" ,
338342 "golden_tag" ,
339343 "Test Spec" ,
340- )
344+ ). task_runs
341345
342346 # All task runs should have the same session tag
343347 session_tags = set ()
@@ -362,7 +366,7 @@ def test_eval_examples_have_eval_tag(self):
362366 "train_tag" ,
363367 "golden_tag" ,
364368 "Test Spec" ,
365- )
369+ ). task_runs
366370
367371 eval_runs = [tr for tr in task_runs if "eval_tag" in tr .tags ]
368372 num_runs = NUM_SAMPLES_PER_TOPIC * NUM_TOPICS
@@ -383,7 +387,7 @@ def test_train_examples_have_train_tag(self):
383387 "train_tag" ,
384388 "golden_tag" ,
385389 "Test Spec" ,
386- )
390+ ). task_runs
387391
388392 train_runs = [tr for tr in task_runs if "train_tag" in tr .tags ]
389393 num_runs = NUM_SAMPLES_PER_TOPIC * NUM_TOPICS
@@ -405,7 +409,7 @@ def test_handles_insufficient_examples(self):
405409 "train_tag" ,
406410 "golden_tag" ,
407411 "Test Spec" ,
408- )
412+ ). task_runs
409413
410414 # Should use all available examples
411415 assert len (task_runs ) == 5