Skip to content

Commit 462de97

Browse files
authored
Fix CI: Install CUDA toolkit for flashinfer JIT compilation (#748)
1 parent 1ad7e7c commit 462de97

File tree

2 files changed: +11 additions, −0 deletions

.github/workflows/integration_test.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,14 @@ jobs:
           python-version: '3.12'
       - name: Update pip
        run: python -m pip install --upgrade pip
+     - name: Install CUDA toolkit
+       run: |
+         # flashinfer (used by vLLM 0.13.0) requires nvcc for JIT compilation
+         # Add NVIDIA CUDA repository for Amazon Linux / RHEL
+         sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+         sudo dnf install -y cuda-toolkit-12-8
+         echo "CUDA_HOME=/usr/local/cuda-12.8" >> $GITHUB_ENV
+         echo "/usr/local/cuda-12.8/bin" >> $GITHUB_PATH
      - name: Install torchforge
        run: pip install uv && uv pip install . && uv pip install .[dev]
      - name: Run weight sync integration test

tests/integration_tests/fixtures/qwen3_1_7b_tp.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ generator:
   tensor_parallel_size: 4
   pipeline_parallel_size: 1
   enforce_eager: ${not:${compile}}
+  # Reduce memory usage for vLLM 0.13.0 warmup on T4 GPUs
+  max_num_seqs: 128
+  gpu_memory_utilization: 0.85
   sampling_params:
     n: ${group_size}
     max_tokens: ${max_res_tokens}

0 commit comments

Comments (0)