-
Notifications
You must be signed in to change notification settings - Fork 137
334 lines (290 loc) · 12.7 KB
/
test.yml
File metadata and controls
334 lines (290 loc) · 12.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
# Test workflow: runs the full matrix test suite plus a notebook smoke test.
name: Tests
on:
  pull_request:
  push:
    branches:
      - main
      - release/*
# Cancel superseded in-flight runs for the same ref. On main, group by
# run_number instead, so every push to main gets a unique group and is
# never cancelled by a later push.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true
jobs:
  # Loads the test matrix from .github/matrix.json so matrix changes do not
  # require editing this workflow file. The compact (-c) JSON is exported as
  # a job output and consumed by the `test` job below via fromJSON().
  load-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
      - name: Load matrix from file
        id: set-matrix
        run: |
          # jq -c collapses the file to a single line, as required for
          # a GitHub Actions step output.
          matrix=$(jq -c . .github/matrix.json)
          echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
test:
needs: load-matrix
strategy:
fail-fast: false
matrix: ${{ fromJSON(needs.load-matrix.outputs.matrix) }}
name: test-${{ matrix.runtime-version }}-py${{ matrix.python-version }}-${{ matrix.pytorch-version }}-${{ matrix.backend }}-${{ matrix.alias }}
container: ${{ matrix.image != '' && fromJSON(format('{{"image":"{0}","options":"{1}"}}', matrix.image, matrix.container-options)) || '' }}
runs-on: ${{ matrix.runner }}
defaults:
run:
shell: bash -l {0}
steps:
- name: Run NVIDIA command
if: startsWith(matrix.image, 'nvidia')
run: |
echo "Detected NVIDIA image"
nvidia-smi || echo "nvidia-smi not found"
- name: Run ROCm command
if: startsWith(matrix.image, 'rocm')
run: |
echo "Detected ROCm image"
rocminfo || echo "rocminfo not found"
- name: Check out code
uses: actions/checkout@v6
- name: Install system dependencies
run: |
set -eux
SUDO=$(command -v sudo 2>/dev/null || true)
$SUDO apt-get update
$SUDO apt-get install -y libdw1 curl wget git pkg-config zlib1g-dev build-essential
- name: Install NVSHMEM
if: contains(matrix.alias, 'distributed')
run: |
set -euxo pipefail
GPU_COUNT=$(nvidia-smi -L | wc -l)
if [ "$GPU_COUNT" -ne 4 ]; then
echo "Error: Expected 4 GPUs but found $GPU_COUNT"
exit 1
fi
curl -L https://raw.githubusercontent.com/pytorch/pytorch/main/.ci/docker/common/install_cuda.sh -o install_cuda.sh
chmod +x install_cuda.sh
source install_cuda.sh
install_nvshmem 13 3.4.5
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
python-version: ${{ matrix.python-version }}
enable-cache: true
- name: Create virtual environment
run: |
uv venv --python ${{ matrix.python-version }}
- name: Get current month
id: date
run: echo "month=$(date +'%Y-%m')" >> $GITHUB_OUTPUT
- name: Cache dependencies
id: cache
uses: actions/cache@v5
with:
path: |
~/.cache/uv
~/.venv
key: ${{ matrix.python-version }}-${{ matrix.runtime-version }}-${{ matrix.pytorch-version }}-${{ hashFiles('.github/workflows/test.yml') }}-${{ steps.date.outputs.month }}
- name: Install PyTorch
run: |
source .venv/bin/activate
if [[ "${{ matrix.pytorch-version }}" == "pytorch-2.11" ]]; then
# Install stable 2.11 release
uv pip install -U "torch==2.11.*" --index-url https://download.pytorch.org/whl/${{ matrix.runtime-version }}
elif [[ "${{ matrix.runtime-version }}" == "tpu" ]]; then
# TPU: install CPU-only PyTorch nightly (torch_tpu provides TPU backend)
uv pip install -U --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
else
# Default to nightly
if [[ "${{ matrix.runtime-version }}" == "cu128" ]]; then
# Install nvidia-nvshmem-cu12 from cu129 index (missing on cu128)
uv pip install -U --pre nvidia-nvshmem-cu12 --index-url https://download.pytorch.org/whl/nightly/cu129
fi
uv pip install -U --pre torch --index-url https://download.pytorch.org/whl/nightly/${{ matrix.runtime-version }}
fi
- name: Install Triton
if: matrix.backend == 'tileir' || (matrix.backend == 'triton' && steps.cache.outputs.cache-hit != 'true' && matrix.pytorch-version != 'pytorch-2.11')
run: |
set -x
source .venv/bin/activate
apt-get update
apt-get install -y git
apt-get install -y clang-20 clang++-20 zlib1g-dev
export CC=clang-20
export CXX=clang++-20
mkdir -p /tmp/$USER
cd /tmp/$USER
uv pip uninstall triton pytorch-triton || true
rm -rf triton/ || true
if [[ "${{ matrix.backend }}" == "tileir" ]]; then
git clone --recursive -b main https://github.com/triton-lang/Triton-to-tile-IR.git triton
else
git clone https://github.com/triton-lang/triton.git triton
if [[ "${{ matrix.python-version }}" == "3.14" ]]; then
# Pin Python 3.14 nightly to known-good Triton revision until backend detection is fixed upstream.
git -C triton checkout 77a13369
else
git -C triton checkout 9844da95
fi
fi
cd triton/
uv pip install -r python/requirements.txt
MAX_JOBS=$(nproc) TRITON_PARALLEL_LINK_JOBS=2 uv pip install .
cd /tmp/$USER
rm -rf triton/
python -c "import triton; print(f'Triton version: {triton.__version__}')"
- name: Pin networkx for Python 3.14
if: matrix.python-version == '3.14'
run: |
source .venv/bin/activate
uv pip install networkx==2.8.8
- name: Install Helion
run: |
source .venv/bin/activate
uv pip install setuptools ninja
SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0" uv pip install -e .'[dev]'
python -c "import helion; print(helion.__name__)"
- name: Install TPU dependencies (Pallas)
if: matrix.alias == 'tpu'
run: |
set -euxo pipefail
source .venv/bin/activate
uv pip install \
--extra-index-url https://us-python.pkg.dev/ml-oss-artifacts-published/jax/simple/ \
--find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html \
--pre \
'jax==0.9.2' 'jaxlib==0.9.2' 'libtpu==0.0.37' 'tpu-info==0.7.1' 'jaxtyping' 'frozendict'
# Install Bazel
if ! command -v bazel &> /dev/null; then
sudo curl -L https://github.com/bazelbuild/bazelisk/releases/download/v1.27.0/bazelisk-linux-amd64 -o /usr/local/bin/bazel
sudo chmod +x /usr/local/bin/bazel
fi
# Install gcloud CLI if not present (needed for Secret Manager)
if ! command -v gcloud &> /dev/null; then
sudo apt-get install -y apt-transport-https ca-certificates gnupg curl
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | sudo tee /etc/apt/sources.list.d/google-cloud-sdk.list
sudo apt-get update && sudo apt-get install -y google-cloud-cli
fi
# Clone torch_tpu via GCP Secret Manager SSH key (same as pytorch CI)
TORCH_TPU_COMMIT=$(cat .github/ci_commit_pins/torch_tpu.txt)
set +x
gcloud secrets versions access latest \
--secret="torchtpu-read-key" \
--project="ml-velocity-actions-testing" > /tmp/torch_tpu_ssh_key
set -x
chmod 600 /tmp/torch_tpu_ssh_key
GIT_SSH_COMMAND="ssh -i /tmp/torch_tpu_ssh_key -o IdentitiesOnly=yes -o StrictHostKeyChecking=no" \
git clone [email protected]:google-pytorch/torch_tpu.git /tmp/torch_tpu
rm -f /tmp/torch_tpu_ssh_key
cd /tmp/torch_tpu
git checkout "${TORCH_TPU_COMMIT}"
# Build torch_tpu wheel
export TORCH_SOURCE=$(python -c "import torch; import os; print(os.path.dirname(os.path.dirname(torch.__file__)))")
export SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
bazel build -c opt //ci/wheel:torch_tpu_wheel --config=helion_public_caching_readwrite --define WHEEL_VERSION=0.1.0 --define TORCH_SOURCE=local --action_env=PYTHONPATH=$TORCH_SOURCE:$SITE_PACKAGES --action_env=JAX_PLATFORMS=cpu
uv pip install bazel-bin/ci/wheel/*.whl
cd -
rm -rf /tmp/torch_tpu
# Verify
python -c "from torch_tpu import api; print(f'TPU device: {api.tpu_device()}')"
- name: Install CUTLASS CuTe DSL
if: matrix.backend == 'cute'
run: |
source .venv/bin/activate
SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0" uv pip install .'[cute-cu12]'
- name: CUDA Compute Check
if: startsWith(matrix.image, 'nvidia')
run: |
source .venv/bin/activate
python -c "
import torch, sys
assert torch.cuda.is_available(), 'FATAL: CUDA not available'
n = torch.cuda.device_count()
assert n > 0, 'FATAL: No CUDA devices found'
print(f'CUDA devices: {n}')
for i in range(n):
dev = torch.device('cuda', i)
a = torch.randn(256, 256, device=dev)
b = (a @ a).sum().item()
print(f' Device {i} ({torch.cuda.get_device_name(i)}): OK')
print(f'All {n} devices healthy')
"
- name: Run Tests
run: |
set -o pipefail
source .venv/bin/activate
uv pip install pytest-xdist
# Conditionally enable ref-eager and golden-accept/dtype-assert test modes
if [[ "${{ matrix.dtype-asserts }}" == "true" ]]; then export HELION_DEBUG_DTYPE_ASSERTS=1; fi
if [[ "${{ matrix.expecttest-accept }}" == "true" ]]; then export EXPECTTEST_ACCEPT=1; fi
if [[ "${{ matrix.ref-eager }}" == "true" ]]; then export HELION_INTERPRET=1; fi
if [[ "${{ matrix.backend }}" == "tileir" ]]; then export ENABLE_TILE=1; fi
export HELION_BACKEND=${{ matrix.backend }}
# -rf: print failed tests
# --timeout: max allowed time for each test
PARALLEL="-n4"
if [[ "${{ contains(matrix.alias, 'distributed') }}" == "true" ]]; then
TEST_PATH="test/test_examples_dist.py"
EXTRA_FLAGS="-rs"
elif [[ "${{ matrix.alias }}" == "tpu" ]]; then
TEST_PATH="."
EXTRA_FLAGS="--ignore=test/test_examples_dist.py"
PARALLEL=""
else
TEST_PATH="."
EXTRA_FLAGS="--ignore=test/test_examples_dist.py"
fi
# For distributed tests, fail if any test is skipped, failed, or has an error
SKIP_CHECK=$([[ "${{ contains(matrix.alias, 'distributed') }}" == "true" ]] && echo "! grep -qE '(SKIPPED|FAILED|ERROR)'" || echo "cat > /dev/null")
pytest $PARALLEL -rf --timeout=60 $EXTRA_FLAGS $TEST_PATH | tee >(eval $SKIP_CHECK)
test-notebooks:
name: test-notebooks-cu128-py3.12-pytorch-2.11-a10g
container:
image: nvidia/cuda:12.8.1-devel-ubuntu24.04
options: --gpus all
runs-on: linux.g5.4xlarge.nvidia.gpu
defaults:
run:
shell: bash -l {0}
steps:
- name: Run NVIDIA command
run: |
echo "Detected NVIDIA image"
nvidia-smi || echo "nvidia-smi not found"
- name: Check out code
uses: actions/checkout@v6
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
python-version: "3.12"
enable-cache: true
- name: Create virtual environment
run: |
uv venv --python 3.12
- name: Install pip in venv
run: |
source .venv/bin/activate
uv pip install pip
- name: Get current month
id: date
run: echo "month=$(date +'%Y-%m')" >> $GITHUB_OUTPUT
- name: Cache dependencies
id: cache
uses: actions/cache@v5
with:
path: |
~/.cache/uv
~/.venv
key: notebooks-3.12-cu128-${{ hashFiles('.github/workflows/test.yml') }}-${{ steps.date.outputs.month }}
- name: Install notebook execution tools
run: |
source .venv/bin/activate
# Install jupyter for executing notebooks
uv pip install jupyter nbconvert pytest numpy "nbclient<0.10"
- name: Run Notebook Tests
run: |
source .venv/bin/activate
# Execute notebook using jupyter nbconvert
# The notebook's subprocess pip install will install torch and helion
jupyter nbconvert --to notebook --execute --inplace \
--ExecutePreprocessor.timeout=600 \
notebooks/softmax.ipynb