Skip to content

Commit 462de97

Browse files
authored
Fix CI: Install CUDA toolkit for flashinfer JIT compilation (#748)
1 parent 1ad7e7c commit 462de97

File tree

2 files changed: +11 additions, −0 deletions

.github/workflows/integration_test.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,14 @@ jobs:
           python-version: '3.12'
       - name: Update pip
        run: python -m pip install --upgrade pip
+     - name: Install CUDA toolkit
+       run: |
+         # flashinfer (used by vLLM 0.13.0) requires nvcc for JIT compilation
+         # Add NVIDIA CUDA repository for Amazon Linux / RHEL
+         sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+         sudo dnf install -y cuda-toolkit-12-8
+         echo "CUDA_HOME=/usr/local/cuda-12.8" >> $GITHUB_ENV
+         echo "/usr/local/cuda-12.8/bin" >> $GITHUB_PATH
      - name: Install torchforge
        run: pip install uv && uv pip install . && uv pip install .[dev]
      - name: Run weight sync integration test

tests/integration_tests/fixtures/qwen3_1_7b_tp.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ generator:
   tensor_parallel_size: 4
   pipeline_parallel_size: 1
   enforce_eager: ${not:${compile}}
+  # Reduce memory usage for vLLM 0.13.0 warmup on T4 GPUs
+  max_num_seqs: 128
+  gpu_memory_utilization: 0.85
   sampling_params:
     n: ${group_size}
     max_tokens: ${max_res_tokens}

0 commit comments

Comments (0)