Skip to content

Commit ae18ee4

Browse files
authored
Merge branch 'main' into fix/matmul4bit-gemv-shape-guard
2 parents 4aaf727 + 74994ef commit ae18ee4

27 files changed

Lines changed: 1328 additions & 238 deletions

.github/scripts/build-rocm.sh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,33 @@ if [ "${build_os:0:6}" == ubuntu ]; then
2121
&& pip install cmake==3.31.6 \
2222
&& cmake -DCOMPUTE_BACKEND=hip -DCMAKE_BUILD_TYPE=MinSizeRel -DCMAKE_HIP_FLAGS=\"--offload-compress\" -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
2323
&& cmake --build ."
24+
else
25+
bnb_rocm_arch="gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
26+
27+
pip install ninja cmake==3.31.6
28+
29+
# Install ROCm SDK wheels from repo.radeon.com.
30+
rocm_base_url="https://repo.radeon.com/rocm/windows/rocm-rel-${rocm_version}"
31+
pip install \
32+
"${rocm_base_url}/rocm_sdk_core-${rocm_version}-py3-none-win_amd64.whl" \
33+
"${rocm_base_url}/rocm_sdk_devel-${rocm_version}-py3-none-win_amd64.whl" \
34+
"${rocm_base_url}/rocm_sdk_libraries_custom-${rocm_version}-py3-none-win_amd64.whl" \
35+
"${rocm_base_url}/rocm-${rocm_version}.tar.gz"
36+
37+
# Expand the devel tarball
38+
rocm-sdk init
39+
40+
ROCM_PATH="$(rocm-sdk path --root)"
41+
export ROCM_PATH
42+
export PATH="${ROCM_PATH}/bin:${PATH}"
43+
44+
cmake -G Ninja \
45+
-DCOMPUTE_BACKEND=hip \
46+
-DBNB_ROCM_ARCH="${bnb_rocm_arch}" \
47+
-DCMAKE_BUILD_TYPE=MinSizeRel \
48+
-DCMAKE_HIP_FLAGS="--offload-compress" \
49+
-S .
50+
cmake --build .
2451
fi
2552

2653
output_dir="output/${build_os}/${build_arch}"

.github/workflows/python-package.yml

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,11 +136,16 @@ jobs:
136136
matrix:
137137
os: [ubuntu-22.04]
138138
arch: [x86_64]
139-
rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2", "7.1", "7.2"]
139+
rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2", "7.1", "7.2.1"]
140+
include:
141+
- os: windows-2025
142+
arch: x86_64
143+
rocm_version: "7.2.1"
140144
runs-on: ${{ matrix.os }}
141145
steps:
142146
- uses: actions/checkout@v4
143147
- name: Clean up disk space
148+
if: startsWith(matrix.os, 'ubuntu')
144149
run: |
145150
echo "Disk space before cleanup:"
146151
df -h
@@ -156,6 +161,9 @@ jobs:
156161
157162
echo "Disk space after cleanup:"
158163
df -h
164+
- name: Setup MSVC
165+
if: startsWith(matrix.os, 'windows')
166+
uses: ilammy/msvc-dev-cmd@v1.13.0
159167
- name: Build C++
160168
run: bash .github/scripts/build-rocm.sh
161169
env:
@@ -332,6 +340,12 @@ jobs:
332340
done
333341
334342
cat >> body.md << 'ENDOFMARKDOWN'
343+
> **Custom PyTorch builds (Intel XPU, ROCm, etc.):**
344+
> The `--force-reinstall` flag causes pip to re-resolve all dependencies from PyPI, which may replace your custom PyTorch build with the default CUDA variant. To avoid this, add `--no-deps`:
345+
> ```sh
346+
> pip install --force-reinstall --no-deps https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl
347+
> ```
348+
335349
> **Note:**
336350
> These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
337351

.github/workflows/tests-nightly.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020
platform: [linux-x64, linux-aarch64, macos, windows]
2121
# default runners don't have AVX-512 support, but icelake does
2222
cpu_type: ["", icelake]
23-
torch_version: ["2.3.1", "2.9.1", "2.10.0"]
23+
torch_version: ["2.3.1", "2.10.0", "2.11.0"]
2424

2525
exclude:
2626
# aarch64 minimum torch version is 2.5.1
@@ -71,7 +71,7 @@ jobs:
7171
torch_version: "2.9.1"
7272
pypi_index: "https://download.pytorch.org/whl/cu128"
7373
- cuda_version: "13.0.2"
74-
torch_version: "2.10.0"
74+
torch_version: "2.11.0"
7575
pypi_index: "https://download.pytorch.org/whl/cu130"
7676

7777
# Windows CUDA Tests - T4 GPU (CUDA 11.8 only, multiple torch versions)

.github/workflows/tests-pr.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
platform: [linux-x64, linux-aarch64, macos]
3232
# default runners don't have AVX-512 support, but icelake does
3333
cpu_type: ["", icelake]
34-
torch_version: ["2.3.1", "2.10.0"]
34+
torch_version: ["2.3.1", "2.11.0"]
3535

3636
exclude:
3737
# aarch64 minimum torch version is 2.5.1
@@ -76,7 +76,7 @@ jobs:
7676
torch_version: "2.9.1"
7777
pypi_index: "https://download.pytorch.org/whl/cu128"
7878
- cuda_version: "13.0.2"
79-
torch_version: "2.10.0"
79+
torch_version: "2.11.0"
8080
pypi_index: "https://download.pytorch.org/whl/cu130"
8181

8282
# Windows CUDA test - single configuration

CMakeLists.txt

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,17 @@ cmake_minimum_required(VERSION 3.22.1)
1919
# On Windows with HIP backend, auto-detect compilers from ROCM_PATH before project()
2020
if(WIN32 AND COMPUTE_BACKEND STREQUAL "hip")
2121
if(DEFINED ENV{ROCM_PATH})
22-
set(ROCM_PATH $ENV{ROCM_PATH})
22+
file(TO_CMAKE_PATH "$ENV{ROCM_PATH}" ROCM_PATH)
2323
endif()
2424
if(ROCM_PATH AND NOT DEFINED CMAKE_CXX_COMPILER)
2525
set(CMAKE_CXX_COMPILER "${ROCM_PATH}/lib/llvm/bin/clang++.exe")
2626
endif()
2727
if(ROCM_PATH AND NOT DEFINED CMAKE_HIP_COMPILER)
2828
set(CMAKE_HIP_COMPILER "${ROCM_PATH}/lib/llvm/bin/clang++.exe")
2929
endif()
30+
if(NOT DEFINED HIP_PLATFORM)
31+
set(HIP_PLATFORM "amd" CACHE STRING "HIP Platform")
32+
endif()
3033
# On Windows, the HIP compiler needs explicit paths to find device libraries.
3134
if(ROCM_PATH)
3235
find_path(ROCM_DEVICE_LIB_PATH
@@ -35,9 +38,9 @@ if(WIN32 AND COMPUTE_BACKEND STREQUAL "hip")
3538
"${ROCM_PATH}/lib/llvm/amdgcn/bitcode"
3639
NO_DEFAULT_PATH
3740
)
38-
set(CMAKE_HIP_FLAGS "--rocm-path=${ROCM_PATH}")
41+
string(APPEND CMAKE_HIP_FLAGS " --rocm-path=${ROCM_PATH}")
3942
if(ROCM_DEVICE_LIB_PATH)
40-
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --rocm-device-lib-path=${ROCM_DEVICE_LIB_PATH}")
43+
string(APPEND CMAKE_HIP_FLAGS " --rocm-device-lib-path=${ROCM_DEVICE_LIB_PATH}")
4144
endif()
4245
endif()
4346
endif()
@@ -357,7 +360,7 @@ endif()
357360
if(BUILD_HIP)
358361
# Determine ROCM_PATH from environment variable, fallback to /opt/rocm on Linux
359362
if(DEFINED ENV{ROCM_PATH})
360-
set(ROCM_PATH $ENV{ROCM_PATH})
363+
file(TO_CMAKE_PATH "$ENV{ROCM_PATH}" ROCM_PATH)
361364
else()
362365
set(ROCM_PATH /opt/rocm)
363366
endif()
@@ -416,11 +419,15 @@ if(WIN32)
416419
set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
417420
endif()
418421
set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME})
419-
if(MSVC)
420-
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
421-
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
422-
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
423-
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
422+
if(WIN32)
423+
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")
424+
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")
425+
if(MSVC)
426+
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
427+
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
428+
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
429+
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
430+
endif()
424431
endif()
425432

426433
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")

README.md

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ bitsandbytes has the following minimum requirements for all platforms:
5757
<td>Minimum: AVX2<br>Optimized: AVX512F, AVX512BF16</td>
5858
<td>✅</td>
5959
<td>✅</td>
60-
<td></td>
60+
<td></td>
6161
</tr>
6262
<tr>
6363
<td></td>
@@ -88,7 +88,7 @@ bitsandbytes has the following minimum requirements for all platforms:
8888
</td>
8989
<td>✅</td>
9090
<td>✅</td>
91-
<td>〰️</td>
91+
<td></td>
9292
</tr>
9393
<tr>
9494
<td></td>
@@ -123,7 +123,7 @@ bitsandbytes has the following minimum requirements for all platforms:
123123
<td>AVX2</td>
124124
<td>✅</td>
125125
<td>✅</td>
126-
<td></td>
126+
<td></td>
127127
</tr>
128128
<tr>
129129
<td></td>
@@ -133,6 +133,18 @@ bitsandbytes has the following minimum requirements for all platforms:
133133
<td>✅</td>
134134
<td>✅</td>
135135
</tr>
136+
<tr>
137+
<td></td>
138+
<td>🟥 AMD GPU <br><code>cuda</code></td>
139+
<td>
140+
RDNA: gfx1100, gfx1101, gfx1102,<br>
141+
gfx1150, gfx1151,<br>
142+
gfx1200, gfx1201
143+
</td>
144+
<td>✅</td>
145+
<td>✅</td>
146+
<td>✅</td>
147+
</tr>
136148
<tr>
137149
<td></td>
138150
<td>🟦 Intel GPU <br><code>xpu</code></td>
@@ -142,7 +154,7 @@ bitsandbytes has the following minimum requirements for all platforms:
142154
</td>
143155
<td>✅</td>
144156
<td>✅</td>
145-
<td>〰️</td>
157+
<td></td>
146158
</tr>
147159
<tr>
148160
<td colspan="6">🍎 <strong>macOS 14+</strong></td>

bitsandbytes/autograd/_functions.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -382,10 +382,7 @@ def matmul_4bit(
382382
bias: Optional[torch.Tensor] = None,
383383
):
384384
assert quant_state is not None
385-
# Change dtype to input dtype on CPU
386385
if A.device.type == "cpu":
387-
quant_state.dtype = A.dtype
388-
389386
if getattr(quant_state, "packing_format_for_cpu", False):
390387
out = F.gemv_4bit(A, B, out, state=quant_state)
391388
if bias is not None:

0 commit comments

Comments
 (0)