Skip to content

Commit e3f7cc3

Browse files
authored
Use multi runners to check new failing tests in a CI run (#45032)
* multi runners * multi runners --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
1 parent 882ffdb commit e3f7cc3

File tree

2 files changed

+163
-67
lines changed

2 files changed

+163
-67
lines changed

.github/workflows/check_failed_tests.yml

Lines changed: 116 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ on:
2424
pr_number:
2525
required: false
2626
type: string
27+
max_num_runners:
28+
required: false
29+
type: number
30+
default: 4
2731
outputs:
2832
is_check_failures_ok:
2933
description: "Whether the failure checking infrastructure succeeded"
@@ -43,15 +47,82 @@ env:
4347

4448

4549
jobs:
50+
setup_check_new_failures:
  # Sizes the matrix used by `check_new_failures`: reads `new_failures.json`
  # from the `ci_results_<job>` artifact, counts the single-gpu failures and
  # exposes `matrix` (list of runner indices), `n_runners` and `process`.
  name: "Setup matrix for finding commits"
  runs-on: ubuntu-22.04
  outputs:
    matrix: ${{ steps.set-matrix.outputs.matrix }}
    n_runners: ${{ steps.set-matrix.outputs.n_runners }}
    process: ${{ steps.set-matrix.outputs.process }}
  steps:
    # A failed download does not fail the job (`continue-on-error`); the next
    # step reports `process=false` when the expected file is missing.
    - uses: actions/download-artifact@v4
      continue-on-error: true
      with:
        name: ci_results_${{ inputs.job }}
        path: ci_results_${{ inputs.job }}

    - name: Set matrix
      id: set-matrix
      env:
        job: ${{ inputs.job }}
        max_num_runners: ${{ inputs.max_num_runners }}
      run: |
        python3 - << 'EOF'
        import json
        import math
        import os
        import sys

        # Target number of failing tests per runner when sizing the matrix.
        TESTS_PER_RUNNER = 10
        # Mirrors the declared default of the `max_num_runners` workflow input;
        # only used when the environment variable is empty.
        DEFAULT_MAX_NUM_RUNNERS = 4


        def write_outputs(**outputs):
            """Append `key=value` lines to the step's GITHUB_OUTPUT file."""
            with open(os.environ["GITHUB_OUTPUT"], "a") as f:
                for key, value in outputs.items():
                    f.write(f"{key}={value}\n")


        print("Script started")

        job = os.environ["job"]
        filepath = f"ci_results_{job}/new_failures.json"

        print(f"Looking for file: {filepath}")
        print(f"File exists: {os.path.isfile(filepath)}")

        if not os.path.isfile(filepath):
            print("File not found, setting process=false")
            write_outputs(process="false")
            # `sys.exit` rather than the `exit` helper that the `site` module
            # injects for interactive sessions.
            sys.exit(0)

        with open(filepath) as f:
            reports = json.load(f)

        print(f"Loaded reports with {len(reports)} models")

        # Each model entry either nests its results under "failures" or holds
        # them directly; only single-gpu failures are counted.
        n_tests = sum(
            len(model_data.get("failures", model_data).get("single-gpu", []))
            for model_data in reports.values()
        )

        print(f"n_tests: {n_tests}")

        # An empty value would make `int()` raise; fall back to the default.
        raw_max_num_runners = os.environ.get("max_num_runners", "").strip()
        max_num_runners = int(raw_max_num_runners) if raw_max_num_runners else DEFAULT_MAX_NUM_RUNNERS

        # At least one runner, at most `max_num_runners`.
        n_runners = max(1, min(max_num_runners, math.ceil(n_tests / TESTS_PER_RUNNER)))

        print(f"n_runners: {n_runners}")

        write_outputs(
            matrix=json.dumps(list(range(n_runners))),
            n_runners=n_runners,
            process="true",
        )

        print("Done")
        EOF
113+
114+
46115
check_new_failures:
47116
name: "Find commits for new failing tests"
117+
needs: setup_check_new_failures
118+
if: needs.setup_check_new_failures.outputs.process == 'true'
48119
strategy:
49120
matrix:
50-
run_idx: [1]
121+
run_idx: ${{ fromJson(needs.setup_check_new_failures.outputs.matrix) }}
51122
runs-on:
52123
group: aws-g5-4xlarge-cache
53124
outputs:
54-
process: ${{ steps.check_file.outputs.process }}
125+
process: ${{ needs.setup_check_new_failures.outputs.process }}
55126
container:
56127
image: ${{ inputs.docker }}
57128
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -61,31 +132,13 @@ jobs:
61132
name: ci_results_${{ inputs.job }}
62133
path: /transformers/ci_results_${{ inputs.job }}
63134

64-
- name: Check file
65-
id: check_file
66-
working-directory: /transformers
67-
env:
68-
job: ${{ inputs.job }}
69-
run: |
70-
if [ -f "ci_results_${job}/new_failures.json" ]; then
71-
echo "\`ci_results_${job}/new_failures.json\` exists, continue ..."
72-
echo "process=true" >> $GITHUB_ENV
73-
echo "process=true" >> $GITHUB_OUTPUT
74-
else
75-
echo "\`ci_results_${job}/new_failures.json\` doesn't exist, abort."
76-
echo "process=false" >> $GITHUB_ENV
77-
echo "process=false" >> $GITHUB_OUTPUT
78-
fi
79-
80135
- uses: actions/download-artifact@v4
81-
if: ${{ env.process == 'true' }}
82136
with:
83137
pattern: setup_values*
84138
path: setup_values
85139
merge-multiple: true
86140

87141
- name: Prepare some setup values
88-
if: ${{ env.process == 'true' }}
89142
run: |
90143
if [ -f setup_values/prev_workflow_run_id.txt ]; then
91144
echo "PREV_WORKFLOW_RUN_ID=$(cat setup_values/prev_workflow_run_id.txt)" >> $GITHUB_ENV
@@ -95,15 +148,13 @@ jobs:
95148
96149
- name: Update clone
97150
working-directory: /transformers
98-
if: ${{ env.process == 'true' }}
99151
env:
100152
commit_sha: ${{ inputs.commit_sha || github.sha }}
101153
run: |
102154
git fetch origin "$commit_sha" && git checkout "$commit_sha"
103155
104156
- name: Get `START_SHA`
105157
working-directory: /transformers/utils
106-
if: ${{ env.process == 'true' }}
107158
env:
108159
commit_sha: ${{ inputs.commit_sha || github.sha }}
109160
run: |
@@ -112,7 +163,7 @@ jobs:
112163
# This is used if the CI is triggered from a pull request `self-comment-ci.yml` (after security check is verified)
113164
- name: Extract the base commit on `main` (of the merge commit created by Github) if it is a PR
114165
id: pr_info
115-
if: ${{ env.process == 'true' && inputs.pr_number != '' }}
166+
if: ${{ inputs.pr_number != '' }}
116167
uses: actions/github-script@v6
117168
with:
118169
script: |
@@ -134,7 +185,7 @@ jobs:
134185
# (This is why we don't need to specify `workflow_id` which would be fetched automatically in the python script.)
135186
- name: Get `END_SHA` from previous CI runs of the same workflow
136187
working-directory: /transformers/utils
137-
if: ${{ env.process == 'true' && inputs.pr_number == '' }}
188+
if: ${{ inputs.pr_number == '' }}
138189
env:
139190
ACCESS_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
140191
run: |
@@ -144,49 +195,43 @@ jobs:
144195
# parent commit (on `main`) of the `merge_commit` (dynamically created by GitHub). In this case, the goal is to
145196
# see if a reported failing test is actually ONLY failing on the `merge_commit`.
146197
- name: Set `END_SHA`
147-
if: ${{ env.process == 'true' && inputs.pr_number != '' }}
198+
if: ${{ inputs.pr_number != '' }}
148199
env:
149200
merge_commit_base_sha: ${{ steps.pr_info.outputs.merge_commit_base_sha }}
150201
run: |
151202
echo "END_SHA=$merge_commit_base_sha" >> $GITHUB_ENV
152203
153204
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
154205
working-directory: /transformers
155-
if: ${{ env.process == 'true' }}
156206
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
157207

158208
- name: NVIDIA-SMI
159-
if: ${{ env.process == 'true' }}
160209
run: |
161210
nvidia-smi
162211
163212
- name: Environment
164213
working-directory: /transformers
165-
if: ${{ env.process == 'true' }}
166214
run: |
167215
python3 utils/print_env.py
168216
169217
- name: Install pytest-flakefinder
170-
if: ${{ env.process == 'true' }}
171218
run: python3 -m pip install pytest-flakefinder
172219

173220
- name: Show installed libraries and their versions
174221
working-directory: /transformers
175-
if: ${{ env.process == 'true' }}
176222
run: pip freeze
177223

178224
- name: Check failed tests
179225
working-directory: /transformers
180-
if: ${{ env.process == 'true' }}
181226
env:
182227
job: ${{ inputs.job }}
228+
n_runners: ${{ needs.setup_check_new_failures.outputs.n_runners }}
183229
run_idx: ${{ matrix.run_idx }}
184230
pr_number: ${{ inputs.pr_number }}
185231
run: python3 utils/check_bad_commit.py --start_commit "$START_SHA" --end_commit "$END_SHA" --file "ci_results_${job}/new_failures.json" --output_file "new_failures_with_bad_commit_${job}_${run_idx}.json"
186232

187233
- name: Show results
188234
working-directory: /transformers
189-
if: ${{ env.process == 'true' }}
190235
env:
191236
job: ${{ inputs.job }}
192237
run_idx: ${{ matrix.run_idx }}
@@ -237,7 +282,45 @@ jobs:
237282
env:
238283
job: ${{ inputs.job }}
239284
run: |
240-
cp "/transformers/new_failures_with_bad_commit_${job}/new_failures_with_bad_commit_${job}_1.json" new_failures_with_bad_commit.json
285+
python3 - << 'EOF'
286+
# Merge the per-runner `new_failures_with_bad_commit_<job>_<idx>.json` files into
# a single `/transformers/new_failures_with_bad_commit.json`.
import glob
import json
import os
import re
import sys

job = os.environ["job"]
pattern = f"/transformers/new_failures_with_bad_commit_{job}/new_failures_with_bad_commit_{job}_*.json"


def shard_sort_key(path):
    """Order files by the numeric runner index in their `_<idx>.json` suffix.

    Plain lexicographic sorting would place `_10` before `_2`. Files without a
    numeric suffix are kept, sorted by name after the numbered ones.
    """
    match = re.search(r"_(\d+)\.json$", path)
    if match is None:
        return (1, 0, path)
    return (0, int(match.group(1)), path)


files = sorted(glob.glob(pattern), key=shard_sort_key)

if not files:
    print(f"No files found matching: {pattern}")
    # `sys.exit` rather than the `exit` helper injected by the `site` module.
    sys.exit(1)

print(f"Found {len(files)} file(s) to merge: {files}")

# merged[model][gpu_type] -> concatenated list of failures from every file.
merged = {}
for filepath in files:
    with open(filepath) as f:
        data = json.load(f)

    for model, model_results in data.items():
        per_model = merged.setdefault(model, {})
        for gpu_type, failures in model_results.items():
            per_model.setdefault(gpu_type, []).extend(failures)

    print(f"filepath: {filepath}")
    print(len(data))

output_path = "/transformers/new_failures_with_bad_commit.json"
with open(output_path, "w") as f:
    json.dump(merged, f, indent=4)

print(f"Merged {len(files)} file(s) into {output_path}")
print(f"n_items: {len(merged)}")
print(merged)
323+
EOF
241324
242325
- name: Update clone
243326
working-directory: /transformers

utils/check_bad_commit.py

Lines changed: 47 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import os
2020
import re
2121
import subprocess
22+
from collections import defaultdict
2223

2324
import git
2425
import requests
@@ -314,6 +315,9 @@ def get_commit_info(commit, pr_number=None):
314315
parser.add_argument("--output_file", type=str, required=True, help="The path of the output file.")
315316
args = parser.parse_args()
316317

318+
run_idx = os.environ.get("run_idx")
319+
n_runners = os.environ.get("n_runners")
320+
317321
print(f"start_commit: {args.start_commit}")
318322
print(f"end_commit: {args.end_commit}")
319323

@@ -336,6 +340,8 @@ def get_commit_info(commit, pr_number=None):
336340
with open(args.file, "r", encoding="UTF-8") as fp:
337341
reports = json.load(fp)
338342

343+
model_with_failures = []
344+
339345
for model in reports:
340346
# We change the format of "new_failures.json" in PR #XXXXX, let's handle both formats for a few weeks.
341347
if "failures" in reports[model]:
@@ -351,42 +357,49 @@ def get_commit_info(commit, pr_number=None):
351357
reports[model].pop("multi-gpu", None)
352358
failed_tests = reports[model].get("single-gpu", [])
353359

354-
failed_tests_with_bad_commits = []
355-
for failure in failed_tests:
356-
test = failure["line"]
357-
bad_commit_info = find_bad_commit(
358-
target_test=test, start_commit=args.start_commit, end_commit=args.end_commit
359-
)
360-
info = {"test": test}
361-
info.update(bad_commit_info)
362-
363-
bad_commit = bad_commit_info["bad_commit"]
364-
365-
if bad_commit in commit_info_cache:
366-
commit_info = commit_info_cache[bad_commit]
367-
else:
368-
commit_info = get_commit_info(bad_commit)
369-
commit_info_cache[bad_commit] = commit_info
370-
371-
commit_info_copied = copy.deepcopy(commit_info)
372-
commit_info_copied.pop("commit")
373-
commit_info_copied.update({"workflow_commit": args.start_commit, "base_commit": args.end_commit})
374-
info.update(commit_info_copied)
375-
# put failure message toward the end
376-
info = {k: v for k, v in info.items() if not k.startswith(("failure_at_", "job_link"))} | {
377-
k: v for k, v in info.items() if k.startswith(("failure_at_", "job_link"))
378-
}
379-
380-
failed_tests_with_bad_commits.append(info)
381-
382-
# If no single-gpu test failures, remove the key
383-
if len(failed_tests_with_bad_commits) > 0:
384-
reports[model]["single-gpu"] = failed_tests_with_bad_commits
360+
model_with_failures.extend([(model, test) for test in failed_tests])
361+
362+
# Keep only the slice of failures assigned to this runner. Sharding applies only
# when both `run_idx` and `n_runners` are provided; if either is missing or
# empty, every failure is processed by this invocation.
if run_idx not in (None, "") and n_runners not in (None, ""):
    run_idx = int(run_idx)
    n_runners = int(n_runners)

    # Fail loudly: an out-of-range index would otherwise yield an empty or
    # wrong slice and silently drop tests from the combined results.
    if n_runners < 1 or not 0 <= run_idx < n_runners:
        raise ValueError(f"Invalid sharding: run_idx={run_idx}, n_runners={n_runners}")

    # Balanced contiguous split: boundaries at `total * i // n_runners` cover
    # every item exactly once and shard sizes differ by at most one (instead of
    # the last runner absorbing the whole remainder).
    total = len(model_with_failures)
    start_idx = total * run_idx // n_runners
    end_idx = total * (run_idx + 1) // n_runners

    model_with_failures = model_with_failures[start_idx:end_idx]
373+
374+
failed_tests_with_bad_commits = defaultdict(list)
375+
for model, failure in model_with_failures:
376+
test = failure["line"]
377+
bad_commit_info = find_bad_commit(
378+
target_test=test, start_commit=args.start_commit, end_commit=args.end_commit
379+
)
380+
info = {"test": test}
381+
info.update(bad_commit_info)
382+
383+
bad_commit = bad_commit_info["bad_commit"]
384+
385+
if bad_commit in commit_info_cache:
386+
commit_info = commit_info_cache[bad_commit]
385387
else:
386-
reports[model].pop("single-gpu", None)
388+
commit_info = get_commit_info(bad_commit)
389+
commit_info_cache[bad_commit] = commit_info
390+
391+
commit_info_copied = copy.deepcopy(commit_info)
392+
commit_info_copied.pop("commit")
393+
commit_info_copied.update({"workflow_commit": args.start_commit, "base_commit": args.end_commit})
394+
info.update(commit_info_copied)
395+
# put failure message toward the end
396+
info = {k: v for k, v in info.items() if not k.startswith(("failure_at_", "job_link"))} | {
397+
k: v for k, v in info.items() if k.startswith(("failure_at_", "job_link"))
398+
}
399+
400+
failed_tests_with_bad_commits[model].append(info)
387401

388-
# remove the models without any test failure
389-
reports = {k: v for k, v in reports.items() if len(v) > 0}
402+
reports = {model: {"single-gpu": tests} for model, tests in failed_tests_with_bad_commits.items() if tests}
390403

391404
with open(args.output_file, "w", encoding="UTF-8") as fp:
392405
json.dump(reports, fp, ensure_ascii=False, indent=4)

0 commit comments

Comments
 (0)