Skip to content

Commit 250cdb3

Browse files
authored
[ROCm] Windows workflow for creating wheels with ROCm 7.2.1 support (#1915)
* Add Windows ROCm workflow for building wheels with ROCm 7.2.1 support
* Compress binary size with build flags
1 parent 4986b43 commit 250cdb3

9 files changed

Lines changed: 144 additions & 51 deletions

File tree

.github/scripts/build-rocm.sh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,33 @@ if [ "${build_os:0:6}" == ubuntu ]; then
2121
&& pip install cmake==3.31.6 \
2222
&& cmake -DCOMPUTE_BACKEND=hip -DCMAKE_BUILD_TYPE=MinSizeRel -DCMAKE_HIP_FLAGS=\"--offload-compress\" -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
2323
&& cmake --build ."
24+
else
25+
bnb_rocm_arch="gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
26+
27+
pip install ninja cmake==3.31.6
28+
29+
# Install ROCm SDK wheels from repo.radeon.com.
30+
rocm_base_url="https://repo.radeon.com/rocm/windows/rocm-rel-${rocm_version}"
31+
pip install \
32+
"${rocm_base_url}/rocm_sdk_core-${rocm_version}-py3-none-win_amd64.whl" \
33+
"${rocm_base_url}/rocm_sdk_devel-${rocm_version}-py3-none-win_amd64.whl" \
34+
"${rocm_base_url}/rocm_sdk_libraries_custom-${rocm_version}-py3-none-win_amd64.whl" \
35+
"${rocm_base_url}/rocm-${rocm_version}.tar.gz"
36+
37+
# Expand the devel tarball
38+
rocm-sdk init
39+
40+
ROCM_PATH="$(rocm-sdk path --root)"
41+
export ROCM_PATH
42+
export PATH="${ROCM_PATH}/bin:${PATH}"
43+
44+
cmake -G Ninja \
45+
-DCOMPUTE_BACKEND=hip \
46+
-DBNB_ROCM_ARCH="${bnb_rocm_arch}" \
47+
-DCMAKE_BUILD_TYPE=MinSizeRel \
48+
-DCMAKE_HIP_FLAGS="--offload-compress" \
49+
-S .
50+
cmake --build .
2451
fi
2552

2653
output_dir="output/${build_os}/${build_arch}"

.github/workflows/python-package.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,10 +137,15 @@ jobs:
137137
os: [ubuntu-22.04]
138138
arch: [x86_64]
139139
rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2", "7.1", "7.2.1"]
140+
include:
141+
- os: windows-2025
142+
arch: x86_64
143+
rocm_version: "7.2.1"
140144
runs-on: ${{ matrix.os }}
141145
steps:
142146
- uses: actions/checkout@v4
143147
- name: Clean up disk space
148+
if: startsWith(matrix.os, 'ubuntu')
144149
run: |
145150
echo "Disk space before cleanup:"
146151
df -h
@@ -156,6 +161,9 @@ jobs:
156161
157162
echo "Disk space after cleanup:"
158163
df -h
164+
- name: Setup MSVC
165+
if: startsWith(matrix.os, 'windows')
166+
uses: ilammy/msvc-dev-cmd@v1.13.0
159167
- name: Build C++
160168
run: bash .github/scripts/build-rocm.sh
161169
env:

CMakeLists.txt

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,17 @@ cmake_minimum_required(VERSION 3.22.1)
1919
# On Windows with HIP backend, auto-detect compilers from ROCM_PATH before project()
2020
if(WIN32 AND COMPUTE_BACKEND STREQUAL "hip")
2121
if(DEFINED ENV{ROCM_PATH})
22-
set(ROCM_PATH $ENV{ROCM_PATH})
22+
file(TO_CMAKE_PATH "$ENV{ROCM_PATH}" ROCM_PATH)
2323
endif()
2424
if(ROCM_PATH AND NOT DEFINED CMAKE_CXX_COMPILER)
2525
set(CMAKE_CXX_COMPILER "${ROCM_PATH}/lib/llvm/bin/clang++.exe")
2626
endif()
2727
if(ROCM_PATH AND NOT DEFINED CMAKE_HIP_COMPILER)
2828
set(CMAKE_HIP_COMPILER "${ROCM_PATH}/lib/llvm/bin/clang++.exe")
2929
endif()
30+
if(NOT DEFINED HIP_PLATFORM)
31+
set(HIP_PLATFORM "amd" CACHE STRING "HIP Platform")
32+
endif()
3033
# On Windows, the HIP compiler needs explicit paths to find device libraries.
3134
if(ROCM_PATH)
3235
find_path(ROCM_DEVICE_LIB_PATH
@@ -35,9 +38,9 @@ if(WIN32 AND COMPUTE_BACKEND STREQUAL "hip")
3538
"${ROCM_PATH}/lib/llvm/amdgcn/bitcode"
3639
NO_DEFAULT_PATH
3740
)
38-
set(CMAKE_HIP_FLAGS "--rocm-path=${ROCM_PATH}")
41+
string(APPEND CMAKE_HIP_FLAGS " --rocm-path=${ROCM_PATH}")
3942
if(ROCM_DEVICE_LIB_PATH)
40-
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --rocm-device-lib-path=${ROCM_DEVICE_LIB_PATH}")
43+
string(APPEND CMAKE_HIP_FLAGS " --rocm-device-lib-path=${ROCM_DEVICE_LIB_PATH}")
4144
endif()
4245
endif()
4346
endif()
@@ -357,7 +360,7 @@ endif()
357360
if(BUILD_HIP)
358361
# Determine ROCM_PATH from environment variable, fallback to /opt/rocm on Linux
359362
if(DEFINED ENV{ROCM_PATH})
360-
set(ROCM_PATH $ENV{ROCM_PATH})
363+
file(TO_CMAKE_PATH "$ENV{ROCM_PATH}" ROCM_PATH)
361364
else()
362365
set(ROCM_PATH /opt/rocm)
363366
endif()
@@ -416,11 +419,15 @@ if(WIN32)
416419
set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
417420
endif()
418421
set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME})
419-
if(MSVC)
420-
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
421-
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
422-
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
423-
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
422+
if(WIN32)
423+
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")
424+
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")
425+
if(MSVC)
426+
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
427+
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
428+
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
429+
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
430+
endif()
424431
endif()
425432

426433
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")

README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,18 @@ bitsandbytes has the following minimum requirements for all platforms:
133133
<td>✅</td>
134134
<td>✅</td>
135135
</tr>
136+
<tr>
137+
<td></td>
138+
<td>🟥 AMD GPU <br><code>cuda</code></td>
139+
<td>
140+
RDNA: gfx1100, gfx1101, gfx1102,<br>
141+
gfx1150, gfx1151,<br>
142+
gfx1200, gfx1201
143+
</td>
144+
<td>✅</td>
145+
<td>✅</td>
146+
<td>✅</td>
147+
</tr>
136148
<tr>
137149
<td></td>
138150
<td>🟦 Intel GPU <br><code>xpu</code></td>

bitsandbytes/cuda_specs.py

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -109,38 +109,3 @@ def get_rocm_gpu_arch() -> str:
109109
""",
110110
)
111111
return "unknown"
112-
113-
114-
def get_rocm_warpsize() -> int:
115-
"""Get ROCm warp size."""
116-
logger = logging.getLogger(__name__)
117-
try:
118-
if torch.version.hip:
119-
# On Windows, use hipinfo.exe; on Linux, use rocminfo
120-
if platform.system() == "Windows":
121-
cmd = ["hipinfo.exe"]
122-
# hipinfo.exe output format: "warpSize: 32" or "warpSize: 64"
123-
warp_pattern = r"warpSize:\s+(\d+)"
124-
else:
125-
cmd = ["rocminfo"]
126-
warp_pattern = r"Wavefront Size:\s+([0-9]{2})\(0x[0-9]{2}\)"
127-
128-
result = subprocess.run(cmd, capture_output=True, text=True)
129-
match = re.search(warp_pattern, result.stdout)
130-
if match:
131-
return int(match.group(1))
132-
else:
133-
# default to 64 to be safe
134-
return 64
135-
else:
136-
# nvidia cards always use 32 warp size
137-
return 32
138-
except Exception as e:
139-
logger.error(f"Could not detect ROCm warp size: {e}. Defaulting to 64. (some 4-bit functions may not work!)")
140-
if torch.cuda.is_available():
141-
logger.warning(
142-
"""
143-
ROCm warp size detection failed despite ROCm being available.
144-
""",
145-
)
146-
return 64

bitsandbytes/diagnostics/cuda.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,10 @@
3232
}
3333

3434
CUDA_RUNTIME_LIB_PATTERNS = (
35-
("libamdhip64.so*",)
35+
(
36+
"libamdhip64.so*", # Linux
37+
"amdhip64*.dll", # Windows
38+
)
3639
if HIP_ENVIRONMENT
3740
else (
3841
"cudart64*.dll", # Windows

docs/source/errors.mdx

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,36 @@ Make sure this path is appended to the `LD_LIBRARY_PATH` so bnb can find the CUD
2020
If this does not fix the issue, please try compilation from source next.
2121

2222
If this does not work, please open an issue and paste the environment printed when you call `make`, along with the associated error you get when running bnb.
23+
24+
## Library not found: version mismatch
25+
26+
If you see an error like `Library not found: libbitsandbytes_cuda128.dll` or `libbitsandbytes_rocm72.so`, it means the pre-compiled library version doesn't match the CUDA/ROCm version reported by your PyTorch installation.
27+
28+
The library filename encodes the version: `libbitsandbytes_cuda{major}{minor}` for CUDA, `libbitsandbytes_rocm{major}{minor}` for ROCm. bitsandbytes picks which one to load based on what PyTorch reports:
29+
30+
```python
31+
import torch
32+
print(torch.version.cuda) # e.g. "12.8" -> looks for libbitsandbytes_cuda128
33+
print(torch.version.hip) # e.g. "7.2" -> looks for libbitsandbytes_rocm72
34+
```
35+
36+
This commonly happens when your PyTorch was compiled against a different CUDA/ROCm version than what you have installed on your system. For example, PyTorch built with ROCm 7.2 reports `torch.version.hip = "7.2"` and bitsandbytes looks for `libbitsandbytes_rocm72`, even if your system has a different ROCm version installed.
37+
38+
To resolve this:
39+
40+
1. **Install a matching PyTorch version** that aligns with the pre-compiled libraries shipped in the bitsandbytes wheel.
41+
2. **Override the version at runtime** with an environment variable so bitsandbytes loads a different library:
42+
```bash
43+
# Linux / macOS
44+
export BNB_CUDA_VERSION=128 # or BNB_ROCM_VERSION=72
45+
46+
# Windows (cmd)
47+
set BNB_CUDA_VERSION=128
48+
```
49+
3. **Compile from source** to produce a library matching your exact toolkit version. For ROCm, you can override the library name with `-DROCM_VERSION`:
50+
```bash
51+
cmake -DCOMPUTE_BACKEND=hip -DROCM_VERSION=72 -S . # produces libbitsandbytes_rocm72
52+
```
53+
For CUDA, the version is detected automatically from the CUDA compiler on your PATH and cannot be overridden -- make sure the correct CUDA Toolkit is first on your PATH.
54+
55+
See the [installation guide](installation) for full compile-from-source instructions.

docs/source/installation.mdx

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,10 @@ pip install -e .
187187

188188
* Support for AMD GPUs is currently in a preview state.
189189
* All features are supported for both consumer RDNA devices and Data Center CDNA products.
190-
* A compatible PyTorch version with AMD ROCm support is required. It is recommended to use the latest stable release. See [PyTorch on ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html) for guidance.
190+
* A compatible PyTorch version with AMD ROCm support is required. It is recommended to use the latest stable release. On Linux, see [PyTorch on ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/3rd-party/pytorch-install.html) for guidance. On Windows, ROCm-enabled PyTorch wheels are available from:
191+
- [repo.radeon.com/rocm/windows/](https://repo.radeon.com/rocm/windows/) — official AMD releases
192+
- [repo.amd.com/rocm/whl/](https://repo.amd.com/rocm/whl/) — [TheRock](https://github.com/ROCm/TheRock) release builds
193+
- [rocm.nightlies.amd.com/v2](https://rocm.nightlies.amd.com/v2) — TheRock nightly builds
191194

192195
### Installation from PyPI[[rocm-pip]]
193196

@@ -203,8 +206,7 @@ The currently distributed `bitsandbytes` are built with the following configurat
203206
| **Linux x86-64** | 7.0.2 | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1153, gfx1200, gfx1201
204207
| **Linux x86-64** | 7.1.0 | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1153, gfx1200, gfx1201
205208
| **Linux x86-64** | 7.2.1 | CDNA: gfx90a, gfx942, gfx950 / RDNA: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1153, gfx1200, gfx1201
206-
207-
**Windows is not currently supported.**
209+
| **Windows x86-64** | 7.2.1 | RDNA: gfx1100, gfx1101, gfx1102, gfx1150, gfx1151, gfx1200, gfx1201
208210

209211
Use `pip` or `uv` to install the latest release:
210212

@@ -214,12 +216,18 @@ pip install bitsandbytes
214216

215217
### Compile from Source[[rocm-compile]]
216218

217-
bitsandbytes can be compiled from ROCm 6.2 - ROCm 7.2.1.
219+
bitsandbytes can be compiled with ROCm versions 6.2 through 7.2.1. See the `CMakeLists.txt` for additional options.
220+
221+
<hfoptions id="rocm-source">
222+
<hfoption id="Linux">
218223

219-
To compile from source, you need CMake >= **3.31.6**.
224+
To compile from source, you need CMake >= **3.31.6** and Python >= **3.10** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.).
225+
226+
You should also have a ROCm installation (system-wide or via Docker). The current minimum supported version is **6.2**.
220227

221228
```bash
222229
# Install bitsandbytes from source
230+
223231
# Clone bitsandbytes repo
224232
git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
225233

@@ -230,6 +238,36 @@ make
230238
pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
231239
```
232240

241+
</hfoption>
242+
<hfoption id="Windows">
243+
244+
Compilation on Windows requires Visual Studio with C++ support, CMake, Ninja, and Python >= **3.10**.
245+
246+
Instead of a system-wide ROCm installation, you can use the pip-installable ROCm SDK wheels from [repo.radeon.com](https://repo.radeon.com/rocm/windows/):
247+
248+
```bash
249+
# Install ROCm SDK wheels (adjust version as needed)
250+
pip install ninja cmake
251+
pip install \
252+
https://repo.radeon.com/rocm/windows/rocm-rel-7.2.1/rocm_sdk_core-7.2.1-py3-none-win_amd64.whl \
253+
https://repo.radeon.com/rocm/windows/rocm-rel-7.2.1/rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl \
254+
https://repo.radeon.com/rocm/windows/rocm-rel-7.2.1/rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl \
255+
https://repo.radeon.com/rocm/windows/rocm-rel-7.2.1/rocm-7.2.1.tar.gz
256+
257+
# Expand the devel tarball
258+
rocm-sdk init
259+
260+
# Set ROCM_PATH, then build (run these commands from a shell in which the Visual Studio build environment is already activated)
261+
export ROCM_PATH="$(rocm-sdk path --root)"
262+
export PATH="${ROCM_PATH}/bin:${PATH}"
263+
git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
264+
cmake -G Ninja -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH="gfx1100" -DCMAKE_BUILD_TYPE=Release -S .
265+
cmake --build . --config Release
266+
pip install .
267+
```
268+
</hfoption>
269+
</hfoptions>
270+
233271
## Preview Wheels[[preview-wheels]]
234272

235273
If you would like to use new features even before they are officially released and help us test them, feel free to install the wheel directly from our CI (*the wheel links will remain stable!*):

tests/test_linear4bit.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -502,7 +502,7 @@ def test_params4bit_quant_state_attr_access(device, quant_type, compress_statist
502502

503503
@pytest.mark.skipif(not torch.cuda.is_available(), reason="FSDP requires CUDA")
504504
@pytest.mark.skipif(
505-
not torch.distributed.is_nccl_available(),
505+
not getattr(torch.distributed, "is_nccl_available", lambda: False)(),
506506
reason="FSDP test requires NCCL backend",
507507
)
508508
def test_fsdp_state_dict_save_4bit():

0 commit comments

Comments
 (0)