-
Notifications
You must be signed in to change notification settings - Fork 137
334 lines (290 loc) · 12.7 KB
/
test.yml
File metadata and controls
334 lines (290 loc) · 12.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
# Test workflow: runs the full matrix test suite plus a notebook smoke test.
name: Tests
on:
  pull_request:
  push:
    branches:
      - main
      - release/*
# Cancel superseded in-flight runs for the same ref. On main, group by
# run_number instead, so every push to main gets a unique group and is
# never cancelled by a later push.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true
jobs:
  # Loads the test matrix from .github/matrix.json so matrix changes do not
  # require editing this workflow file. The compact (-c) JSON is exported as
  # a job output and consumed by the `test` job below via fromJSON().
  load-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
      - name: Load matrix from file
        id: set-matrix
        run: |
          # jq -c collapses the file to a single line, as required for
          # a GitHub Actions step output.
          matrix=$(jq -c . .github/matrix.json)
          echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
test:
needs: load-matrix
strategy:
fail-fast: false
matrix: ${{ fromJSON(needs.load-matrix.outputs.matrix) }}
name: test-${{ matrix.runtime-version }}-py${{ matrix.python-version }}-${{ matrix.pytorch-version }}-${{ matrix.backend }}-${{ matrix.alias }}
container: ${{ matrix.image != '' && fromJSON(format('{{"image":"{0}","options":"{1}"}}', matrix.image, matrix.container-options)) || '' }}
runs-on: ${{ matrix.runner }}
defaults:
run:
shell: bash -l {0}
steps:
- name: Run NVIDIA command
if: startsWith(matrix.image, 'nvidia')
run: |
echo "Detected NVIDIA image"
nvidia-smi || echo "nvidia-smi not found"
- name: Run ROCm command
if: startsWith(matrix.image, 'rocm')
run: |
echo "Detected ROCm image"
rocminfo || echo "rocminfo not found"
- name: Check out code
uses: actions/checkout@v6
- name: Install system dependencies
run: |
set -eux
SUDO=$(command -v sudo 2>/dev/null || true)
$SUDO apt-get update
$SUDO apt-get install -y libdw1 curl wget git pkg-config zlib1g-dev build-essential
- name: Install NVSHMEM
if: contains(matrix.alias, 'distributed')
run: |
set -euxo pipefail
GPU_COUNT=$(nvidia-smi -L | wc -l)
if [ "$GPU_COUNT" -ne 4 ]; then
echo "Error: Expected 4 GPUs but found $GPU_COUNT"
exit 1
fi
curl -L https://raw.githubusercontent.com/pytorch/pytorch/main/.ci/docker/common/install_cuda.sh -o install_cuda.sh
chmod +x install_cuda.sh
source install_cuda.sh
install_nvshmem 13 3.4.5
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
python-version: ${{ matrix.python-version }}
enable-cache: true
- name: Create virtual environment
run: |
uv venv --python ${{ matrix.python-version }}
- name: Get current month
id: date
run: echo "month=$(date +'%Y-%m')" >> $GITHUB_OUTPUT
- name: Cache dependencies
id: cache
uses: actions/cache@v5
with:
path: |
~/.cache/uv
~/.venv
key: ${{ matrix.python-version }}-${{ matrix.runtime-version }}-${{ matrix.pytorch-version }}-${{ hashFiles('.github/workflows/test.yml') }}-${{ steps.date.outputs.month }}
- name: Install PyTorch
run: |
source .venv/bin/activate
if [[ "${{ matrix.pytorch-version }}" == "pytorch-2.11" ]]; then
# Install stable 2.11 release
uv pip install -U "torch==2.11.*" --index-url https://download.pytorch.org/whl/${{ matrix.runtime-version }}
elif [[ "${{ matrix.runtime-version }}" == "tpu" ]]; then
# TPU: install CPU-only PyTorch nightly (torch_tpu provides TPU backend)
uv pip install -U --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
else
# Default to nightly
if [[ "${{ matrix.runtime-version }}" == "cu128" ]]; then
# Install nvidia-nvshmem-cu12 from cu129 index (missing on cu128)
uv pip install -U --pre nvidia-nvshmem-cu12 --index-url https://download.pytorch.org/whl/nightly/cu129
fi
uv pip install -U --pre torch --index-url https://download.pytorch.org/whl/nightly/${{ matrix.runtime-version }}
fi
- name: Install Triton
if: matrix.backend == 'tileir' || (matrix.backend == 'triton' && steps.cache.outputs.cache-hit != 'true' && matrix.pytorch-version != 'pytorch-2.11')
run: |
set -x
source .venv/bin/activate
apt-get update
apt-get install -y git
apt-get install -y clang-20 clang++-20 zlib1g-dev
export CC=clang-20
export CXX=clang++-20
mkdir -p /tmp/$USER
cd /tmp/$USER
uv pip uninstall triton pytorch-triton || true
rm -rf triton/ || true
if [[ "${{ matrix.backend }}" == "tileir" ]]; then
git clone --recursive -b main https://github.com/triton-lang/Triton-to-tile-IR.git triton
else
git clone https://github.com/triton-lang/triton.git triton
if [[ "${{ matrix.python-version }}" == "3.14" ]]; then
# Pin Python 3.14 nightly to known-good Triton revision until backend detection is fixed upstream.
git -C triton checkout 77a13369
else
git -C triton checkout 9844da95
fi
fi
cd triton/
uv pip install -r python/requirements.txt
MAX_JOBS=$(nproc) TRITON_PARALLEL_LINK_JOBS=2 uv pip install .
cd /tmp/$USER
rm -rf triton/
python -c "import triton; print(f'Triton version: {triton.__version__}')"
- name: Pin networkx for Python 3.14
if: matrix.python-version == '3.14'
run: |
source .venv/bin/activate
uv pip install networkx==2.8.8
- name: Install Helion
run: |
source .venv/bin/activate
uv pip install setuptools ninja
SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0" uv pip install -e .'[dev]'
python -c "import helion; print(helion.__name__)"
- name: Install TPU dependencies (Pallas)
if: matrix.alias == 'tpu'
run: |
set -euxo pipefail
source .venv/bin/activate
uv pip install \
--extra-index-url https://us-python.pkg.dev/ml-oss-artifacts-published/jax/simple/ \
--find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html \
--pre \
'jax==0.9.2' 'jaxlib==0.9.2' 'libtpu==0.0.37' 'tpu-info==0.7.1' 'jaxtyping' 'frozendict'
# Install Bazel
if ! command -v bazel &> /dev/null; then
sudo curl -L https://github.com/bazelbuild/bazelisk/releases/download/v1.27.0/bazelisk-linux-amd64 -o /usr/local/bin/bazel
sudo chmod +x /usr/local/bin/bazel
fi
# Install gcloud CLI if not present (needed for Secret Manager)
if ! command -v gcloud &> /dev/null; then
sudo apt-get install -y apt-transport-https ca-certificates gnupg curl
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | sudo tee /etc/apt/sources.list.d/google-cloud-sdk.list
sudo apt-get update && sudo apt-get install -y google-cloud-cli
fi
# Clone torch_tpu via GCP Secret Manager SSH key (same as pytorch CI)
TORCH_TPU_COMMIT=$(cat .github/ci_commit_pins/torch_tpu.txt)
set +x
gcloud secrets versions access latest \
--secret="torchtpu-read-key" \
--project="ml-velocity-actions-testing" > /tmp/torch_tpu_ssh_key
set -x
chmod 600 /tmp/torch_tpu_ssh_key
GIT_SSH_COMMAND="ssh -i /tmp/torch_tpu_ssh_key -o IdentitiesOnly=yes -o StrictHostKeyChecking=no" \
git clone [email protected]:google-pytorch/torch_tpu.git /tmp/torch_tpu
rm -f /tmp/torch_tpu_ssh_key
cd /tmp/torch_tpu
git checkout "${TORCH_TPU_COMMIT}"
# Build torch_tpu wheel
export TORCH_SOURCE=$(python -c "import torch; import os; print(os.path.dirname(os.path.dirname(torch.__file__)))")
export SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
bazel build -c opt //ci/wheel:torch_tpu_wheel --config=helion_public_caching_readwrite --define WHEEL_VERSION=0.1.0 --define TORCH_SOURCE=local --action_env=PYTHONPATH=$TORCH_SOURCE:$SITE_PACKAGES --action_env=JAX_PLATFORMS=cpu
uv pip install bazel-bin/ci/wheel/*.whl
cd -
rm -rf /tmp/torch_tpu
# Verify
python -c "from torch_tpu import api; print(f'TPU device: {api.tpu_device()}')"
- name: Install CUTLASS CuTe DSL
if: matrix.backend == 'cute'
run: |
source .venv/bin/activate
SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0" uv pip install .'[cute-cu12]'
- name: CUDA Compute Check
if: startsWith(matrix.image, 'nvidia')
run: |
source .venv/bin/activate
python -c "
import torch, sys
assert torch.cuda.is_available(), 'FATAL: CUDA not available'
n = torch.cuda.device_count()
assert n > 0, 'FATAL: No CUDA devices found'
print(f'CUDA devices: {n}')
for i in range(n):
dev = torch.device('cuda', i)
a = torch.randn(256, 256, device=dev)
b = (a @ a).sum().item()
print(f' Device {i} ({torch.cuda.get_device_name(i)}): OK')
print(f'All {n} devices healthy')
"
- name: Run Tests
run: |
set -o pipefail
source .venv/bin/activate
uv pip install pytest-xdist
# Conditionally enable ref-eager and golden-accept/dtype-assert test modes
if [[ "${{ matrix.dtype-asserts }}" == "true" ]]; then export HELION_DEBUG_DTYPE_ASSERTS=1; fi
if [[ "${{ matrix.expecttest-accept }}" == "true" ]]; then export EXPECTTEST_ACCEPT=1; fi
if [[ "${{ matrix.ref-eager }}" == "true" ]]; then export HELION_INTERPRET=1; fi
if [[ "${{ matrix.backend }}" == "tileir" ]]; then export ENABLE_TILE=1; fi
export HELION_BACKEND=${{ matrix.backend }}
# -rf: print failed tests
# --timeout: max allowed time for each test
PARALLEL="-n4"
if [[ "${{ contains(matrix.alias, 'distributed') }}" == "true" ]]; then
TEST_PATH="test/test_examples_dist.py"
EXTRA_FLAGS="-rs"
elif [[ "${{ matrix.alias }}" == "tpu" ]]; then
TEST_PATH="."
EXTRA_FLAGS="--ignore=test/test_examples_dist.py"
PARALLEL=""
else
TEST_PATH="."
EXTRA_FLAGS="--ignore=test/test_examples_dist.py"
fi
# For distributed tests, fail if any test is skipped, failed, or has an error
SKIP_CHECK=$([[ "${{ contains(matrix.alias, 'distributed') }}" == "true" ]] && echo "! grep -qE '(SKIPPED|FAILED|ERROR)'" || echo "cat > /dev/null")
pytest $PARALLEL -rf --timeout=60 $EXTRA_FLAGS $TEST_PATH | tee >(eval $SKIP_CHECK)
test-notebooks:
name: test-notebooks-cu128-py3.12-pytorch-2.11-a10g
container:
image: nvidia/cuda:12.8.1-devel-ubuntu24.04
options: --gpus all
runs-on: linux.g5.4xlarge.nvidia.gpu
defaults:
run:
shell: bash -l {0}
steps:
- name: Run NVIDIA command
run: |
echo "Detected NVIDIA image"
nvidia-smi || echo "nvidia-smi not found"
- name: Check out code
uses: actions/checkout@v6
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
python-version: "3.12"
enable-cache: true
- name: Create virtual environment
run: |
uv venv --python 3.12
- name: Install pip in venv
run: |
source .venv/bin/activate
uv pip install pip
- name: Get current month
id: date
run: echo "month=$(date +'%Y-%m')" >> $GITHUB_OUTPUT
- name: Cache dependencies
id: cache
uses: actions/cache@v5
with:
path: |
~/.cache/uv
~/.venv
key: notebooks-3.12-cu128-${{ hashFiles('.github/workflows/test.yml') }}-${{ steps.date.outputs.month }}
- name: Install notebook execution tools
run: |
source .venv/bin/activate
# Install jupyter for executing notebooks
uv pip install jupyter nbconvert pytest numpy "nbclient<0.10"
- name: Run Notebook Tests
run: |
source .venv/bin/activate
# Execute notebook using jupyter nbconvert
# The notebook's subprocess pip install will install torch and helion
jupyter nbconvert --to notebook --execute --inplace \
--ExecutePreprocessor.timeout=600 \
notebooks/softmax.ipynb