2424 pr_number :
2525 required : false
2626 type : string
27+ max_num_runners :
28+ required : false
29+ type : number
30+ default : 4
2731 outputs :
2832 is_check_failures_ok :
2933 description : " Whether the failure checking infrastructure succeeded"
4347
4448
4549jobs :
# Planning job: decides whether the bisection job needs to run at all and, if so, how many
# parallel runners (matrix entries) the failing tests are spread over.
#
# Outputs:
#   process   - 'true' when `ci_results_<job>/new_failures.json` was found, otherwise 'false'.
#   matrix    - JSON list `[0, 1, ..., n_runners - 1]` (`[]` when process is 'false').
#   n_runners - number of matrix entries (0 when process is 'false').
setup_check_new_failures:
  name: "Setup matrix for finding commits"
  runs-on: ubuntu-22.04
  outputs:
    matrix: ${{ steps.set-matrix.outputs.matrix }}
    n_runners: ${{ steps.set-matrix.outputs.n_runners }}
    process: ${{ steps.set-matrix.outputs.process }}
  steps:
    # A failed download is tolerated here; the next step then sees no report file and
    # emits `process=false`.
    - uses: actions/download-artifact@v4
      continue-on-error: true
      with:
        name: ci_results_${{ inputs.job }}
        path: ci_results_${{ inputs.job }}

    - name: Set matrix
      id: set-matrix
      env:
        job: ${{ inputs.job }}
        max_num_runners: ${{ inputs.max_num_runners }}
      run: |
        python3 - << 'EOF'
        import json, math, os, sys


        def write_outputs(**outputs):
            """Append `key=value` lines to the step's GITHUB_OUTPUT file."""
            with open(os.environ["GITHUB_OUTPUT"], "a") as f:
                for key, value in outputs.items():
                    f.write(f"{key}={value}\n")


        print("Script started")

        job = os.environ["job"]
        filepath = f"ci_results_{job}/new_failures.json"

        print(f"Looking for file: {filepath}")
        print(f"File exists: {os.path.isfile(filepath)}")

        if not os.path.isfile(filepath):
            print("File not found, setting process=false")
            # Still define `matrix` and `n_runners` so every declared job output has a
            # well-formed value, even though consumers are expected to gate on `process`.
            write_outputs(matrix="[]", n_runners=0, process="false")
            sys.exit(0)

        with open(filepath) as f:
            reports = json.load(f)

        print(f"Loaded reports with {len(reports)} models")

        # Each model entry either nests its per-device lists under "failures" or holds them
        # directly; only the "single-gpu" list is counted.
        n_tests = sum(
            len(model_data.get("failures", model_data).get("single-gpu", []))
            for model_data in reports.values()
        )

        print(f"n_tests: {n_tests}")

        max_num_runners = int(os.environ["max_num_runners"])

        # One runner per 10 counted tests, capped at `max_num_runners` and never below 1.
        TESTS_PER_RUNNER = 10
        n_runners = max(1, min(max_num_runners, math.ceil(n_tests / TESTS_PER_RUNNER)))

        print(f"n_runners: {n_runners}")

        write_outputs(
            matrix=json.dumps(list(range(n_runners))),
            n_runners=n_runners,
            process="true",
        )

        print("Done")
        EOF
113+
114+
46115 check_new_failures :
47116 name : " Find commits for new failing tests"
117+ needs : setup_check_new_failures
118+ if : needs.setup_check_new_failures.outputs.process == 'true'
48119 strategy :
49120 matrix :
50- run_idx : [1]
121+ run_idx : ${{ fromJson(needs.setup_check_new_failures.outputs.matrix) }}
51122 runs-on :
52123 group : aws-g5-4xlarge-cache
53124 outputs :
54- process : ${{ steps.check_file .outputs.process }}
125+ process : ${{ needs.setup_check_new_failures .outputs.process }}
55126 container :
56127 image : ${{ inputs.docker }}
57128 options : --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -61,31 +132,13 @@ jobs:
61132 name : ci_results_${{ inputs.job }}
62133 path : /transformers/ci_results_${{ inputs.job }}
63134
64- - name : Check file
65- id : check_file
66- working-directory : /transformers
67- env :
68- job : ${{ inputs.job }}
69- run : |
70- if [ -f "ci_results_${job}/new_failures.json" ]; then
71- echo "\`ci_results_${job}/new_failures.json\` exists, continue ..."
72- echo "process=true" >> $GITHUB_ENV
73- echo "process=true" >> $GITHUB_OUTPUT
74- else
75- echo "\`ci_results_${job}/new_failures.json\` doesn't exist, abort."
76- echo "process=false" >> $GITHUB_ENV
77- echo "process=false" >> $GITHUB_OUTPUT
78- fi
79-
80135 - uses : actions/download-artifact@v4
81- if : ${{ env.process == 'true' }}
82136 with :
83137 pattern : setup_values*
84138 path : setup_values
85139 merge-multiple : true
86140
87141 - name : Prepare some setup values
88- if : ${{ env.process == 'true' }}
89142 run : |
90143 if [ -f setup_values/prev_workflow_run_id.txt ]; then
91144 echo "PREV_WORKFLOW_RUN_ID=$(cat setup_values/prev_workflow_run_id.txt)" >> $GITHUB_ENV
@@ -95,15 +148,13 @@ jobs:
95148
96149 - name : Update clone
97150 working-directory : /transformers
98- if : ${{ env.process == 'true' }}
99151 env :
100152 commit_sha : ${{ inputs.commit_sha || github.sha }}
101153 run : |
102154 git fetch origin "$commit_sha" && git checkout "$commit_sha"
103155
104156 - name : Get `START_SHA`
105157 working-directory : /transformers/utils
106- if : ${{ env.process == 'true' }}
107158 env :
108159 commit_sha : ${{ inputs.commit_sha || github.sha }}
109160 run : |
@@ -112,7 +163,7 @@ jobs:
112163 # This is used if the CI is triggered from a pull request `self-comment-ci.yml` (after security check is verified)
113164 - name : Extract the base commit on `main` (of the merge commit created by Github) if it is a PR
114165 id : pr_info
115- if : ${{ env.process == 'true' && inputs.pr_number != '' }}
166+ if : ${{ inputs.pr_number != '' }}
116167 uses : actions/github-script@v6
117168 with :
118169 script : |
@@ -134,7 +185,7 @@ jobs:
134185 # (This is why we don't need to specify `workflow_id` which would be fetched automatically in the python script.)
135186 - name : Get `END_SHA` from previous CI runs of the same workflow
136187 working-directory : /transformers/utils
137- if : ${{ env.process == 'true' && inputs.pr_number == '' }}
188+ if : ${{ inputs.pr_number == '' }}
138189 env :
139190 ACCESS_TOKEN : ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
140191 run : |
@@ -144,49 +195,43 @@ jobs:
144195 # parent commit (on `main`) of the `merge_commit` (dynamically created by GitHub). In this case, the goal is to
145196 # see if a reported failing test is actually ONLY failing on the `merge_commit`.
146197 - name : Set `END_SHA`
147- if : ${{ env.process == 'true' && inputs.pr_number != '' }}
198+ if : ${{ inputs.pr_number != '' }}
148199 env :
149200 merge_commit_base_sha : ${{ steps.pr_info.outputs.merge_commit_base_sha }}
150201 run : |
151202 echo "END_SHA=$merge_commit_base_sha" >> $GITHUB_ENV
152203
153204 - name : Reinstall transformers in edit mode (remove the one installed during docker image build)
154205 working-directory : /transformers
155- if : ${{ env.process == 'true' }}
156206 run : python3 -m pip uninstall -y transformers && python3 -m pip install -e .
157207
158208 - name : NVIDIA-SMI
159- if : ${{ env.process == 'true' }}
160209 run : |
161210 nvidia-smi
162211
163212 - name : Environment
164213 working-directory : /transformers
165- if : ${{ env.process == 'true' }}
166214 run : |
167215 python3 utils/print_env.py
168216
169217 - name : Install pytest-flakefinder
170- if : ${{ env.process == 'true' }}
171218 run : python3 -m pip install pytest-flakefinder
172219
173220 - name : Show installed libraries and their versions
174221 working-directory : /transformers
175- if : ${{ env.process == 'true' }}
176222 run : pip freeze
177223
178224 - name : Check failed tests
179225 working-directory : /transformers
180- if : ${{ env.process == 'true' }}
181226 env :
182227 job : ${{ inputs.job }}
228+ n_runners : ${{ needs.setup_check_new_failures.outputs.n_runners }}
183229 run_idx : ${{ matrix.run_idx }}
184230 pr_number : ${{ inputs.pr_number }}
185231 run : python3 utils/check_bad_commit.py --start_commit "$START_SHA" --end_commit "$END_SHA" --file "ci_results_${job}/new_failures.json" --output_file "new_failures_with_bad_commit_${job}_${run_idx}.json"
186232
187233 - name : Show results
188234 working-directory : /transformers
189- if : ${{ env.process == 'true' }}
190235 env :
191236 job : ${{ inputs.job }}
192237 run_idx : ${{ matrix.run_idx }}
@@ -237,7 +282,45 @@ jobs:
237282 env :
238283 job : ${{ inputs.job }}
239284 run : |
240- cp "/transformers/new_failures_with_bad_commit_${job}/new_failures_with_bad_commit_${job}_1.json" new_failures_with_bad_commit.json
python3 - << 'EOF'
# Merge the per-runner `new_failures_with_bad_commit_<job>_<run_idx>.json` shards into a
# single `/transformers/new_failures_with_bad_commit.json`.
#
# Each shard is read as {model: {gpu_type: [failure, ...]}}; lists found under the same
# model and gpu_type are concatenated in shard order. Exits with status 1 when no shard
# file is found.
import glob
import json
import os
import re
import sys

job = os.environ["job"]
pattern = f"/transformers/new_failures_with_bad_commit_{job}/new_failures_with_bad_commit_{job}_*.json"


def shard_order(path):
    """Sort key using the numeric run index from the trailing `_<idx>.json`.

    A plain string sort would place `_10.json` before `_2.json`; paths without a
    numeric suffix are kept and sorted after the numbered ones.
    """
    match = re.search(r"_(\d+)\.json$", path)
    if match:
        return (0, int(match.group(1)), path)
    return (1, 0, path)


files = sorted(glob.glob(pattern), key=shard_order)

if not files:
    print(f"No files found matching: {pattern}")
    sys.exit(1)

print(f"Found {len(files)} file(s) to merge: {files}")

merged = {}
for filepath in files:
    with open(filepath) as f:
        data = json.load(f)

    for model, model_results in data.items():
        per_model = merged.setdefault(model, {})
        for gpu_type, failures in model_results.items():
            per_model.setdefault(gpu_type, []).extend(failures)

    print(f"filepath: {filepath}")
    print(len(data))

output_path = "/transformers/new_failures_with_bad_commit.json"
with open(output_path, "w") as f:
    json.dump(merged, f, indent=4)

print(f"Merged {len(files)} file(s) into {output_path}")
print(f"n_items: {len(merged)}")
print(merged)
EOF
241324
242325 - name : Update clone
243326 working-directory : /transformers
0 commit comments