
Commit 6d9b69b

pnunna93, akx, and Titus-von-Koeller authored

Enable bitsandbytes packaging for ROCm (#1299)
* Add build job for rocm
* Add rocm build script
* Copy shared obj file into output_dir
* upload build artifacts and enable wheels build
* Remove cuda build temporarily
* Add ROCm version to .so filename
* Add rocm_version to whls build
* Revert "Remove cuda build temporarily". This reverts commit 1413c5f.
* Add rocm_version env var
* Remove thrush header files
* Print node info
* print cuda node info
* Revert "print cuda node info". This reverts commit cdb209a.
* Revert "Print node info". This reverts commit 7e9a65c.
* Add rocm arch to compile command
* Rename .so files to rocm
* Update default gpu arch
* Skip cpu based igemmlt int tests on ROCm
* Update Documentation
* Update upstream repo name
* Update docs
* Update string format (Co-authored-by: Aarni Koskela <akx@iki.fi>)
* Remove pre-release option for torch install
* Update pytorch install path (Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>)

---------

Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
1 parent 1775035 commit 6d9b69b

8 files changed

Lines changed: 43 additions & 22 deletions

File tree

- .github/scripts/build-rocm.sh
- .github/workflows/python-package.yml
- CMakeLists.txt
- bitsandbytes/cextension.py
- csrc/kernels.hip
- csrc/ops_hip.cuh
- docs/source/installation.mdx
- tests/test_functional.py

.github/scripts/build-rocm.sh

Lines changed: 7 additions & 5 deletions
@@ -1,19 +1,21 @@
 #!/bin/bash
 declare build_arch
 declare build_os
+declare rocm_version

 set -xeuo pipefail
+bnb_rocm_arch="gfx90a;gfx942;gfx1100"
 if [ "${build_os:0:6}" == ubuntu ]; then
-    image=rocm/dev-ubuntu-22.04:6.1-complete
+    image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
     echo "Using image $image"
     docker run --rm --platform "linux/$build_arch" -i \
         -w /src -v "$PWD:/src" "$image" sh -c \
         "apt-get update \
            && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-           && cmake -DCOMPUTE_BACKEND=hip . \
+           && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
            && cmake --build ."
 fi

-#output_dir="output/${build_os}/${build_arch}"
-#mkdir -p "${output_dir}"
-#(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
+output_dir="output/${build_os}/${build_arch}"
+mkdir -p "${output_dir}"
+(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
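For local experimentation, the sketch below shows one way to drive this script with the environment variables it declares. The variable values and the subprocess wrapper are illustrative assumptions, not part of the CI setup; the workflow sets these values from its build matrix.

```python
# Minimal sketch: invoke the ROCm build script locally with the variables it
# declares (build_os, build_arch, rocm_version). Requires Docker and should be
# run from the repository root so the source tree is mounted at /src.
import os
import subprocess

env = dict(
    os.environ,
    build_os="ubuntu-latest",   # "ubuntu..." selects the Docker branch of the script
    build_arch="x86_64",        # passed to docker via --platform linux/<arch>
    rocm_version="6.1.2",       # picks the rocm/dev-ubuntu-22.04:<version>-complete image
)

subprocess.run(["bash", ".github/scripts/build-rocm.sh"], env=env, check=True)
```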

.github/workflows/python-package.yml

Lines changed: 10 additions & 0 deletions
@@ -106,6 +106,8 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         arch: [x86_64]
+        rocm_version:
+          ["6.1.2"]
     runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
     steps:
       - uses: actions/checkout@v4
@@ -123,10 +125,18 @@
         env:
           build_os: ${{ matrix.os }}
           build_arch: ${{ matrix.arch }}
+          rocm_version: ${{ matrix.rocm_version }}
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
+          path: output/*
+          retention-days: 7
   build-wheels:
     needs:
       - build-shared-libs
       - build-shared-libs-cuda
+      - build-shared-libs-rocm
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
CMakeLists.txt

Lines changed: 4 additions & 2 deletions
@@ -185,7 +185,7 @@ elseif(BUILD_HIP)
     set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH})
   else()
     if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
-      set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx940;gfx941;gfx942")
+      set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100")
     elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
       set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
     endif()
@@ -194,12 +194,14 @@ elseif(BUILD_HIP)

   list(APPEND SRC_FILES ${HIP_FILES})

-  string(APPEND BNB_OUTPUT_NAME "_hip")
+  string(APPEND BNB_OUTPUT_NAME "_rocm")

   # get hip version
   execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION)
   string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}")
+  string(REPLACE "." "" HIP_VERSION_SHORT "${HIP_VERSION}")

+  string(APPEND BNB_OUTPUT_NAME "${HIP_VERSION_SHORT}")
   if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1")
     string(APPEND BNB_OUTPUT_NAME "_nohipblaslt")
   endif()
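For illustration only, the Python sketch below mirrors the naming scheme the CMake snippet above produces for the ROCm shared library; it assumes a Linux `.so` suffix and is not part of the build.

```python
# Sketch of the output library name derived from the HIP version, mirroring
# the CMake logic above (illustrative helper, not build code).
def rocm_library_name(hip_version: str, no_hipblaslt: bool = False) -> str:
    major, minor = hip_version.split(".")[:2]
    name = f"libbitsandbytes_rocm{major}{minor}"   # e.g. "_rocm61" for HIP 6.1
    if no_hipblaslt or (int(major), int(minor)) < (6, 1):
        name += "_nohipblaslt"                     # hipBLASLt requires HIP >= 6.1
    return name + ".so"

print(rocm_library_name("6.1"))  # libbitsandbytes_rocm61.so
print(rocm_library_name("6.0"))  # libbitsandbytes_rocm60_nohipblaslt.so
```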

bitsandbytes/cextension.py

Lines changed: 4 additions & 2 deletions
@@ -38,9 +38,9 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path:
     """
     if torch.version.hip:
         if BNB_HIP_VERSION < 601:
-            return PACKAGE_DIR / f"libbitsandbytes_hip_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}"
+            return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}"
         else:
-            return PACKAGE_DIR / f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}"
+            return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}{DYNAMIC_LIBRARY_SUFFIX}"
     library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}"
     if not cuda_specs.has_cublaslt:
         # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt
@@ -119,8 +119,10 @@ def get_native_library() -> BNBNativeLibrary:
     if torch.version.hip:
         hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
         HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
+        BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
     else:
         HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
+        BNB_HIP_VERSION_SHORT = ""
     lib = get_native_library()
 except Exception as e:
     lib = None
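Worked example (not part of the change): how the loader above derives its flags from `torch.version.hip`. The sample version string is hypothetical.

```python
# Sketch of the version parsing in cextension.py for a hypothetical ROCm 6.1 install.
hip_version_string = "6.1.40093-xxxxxxxx"  # illustrative torch.version.hip value

hip_major, hip_minor = map(int, hip_version_string.split(".")[0:2])
BNB_HIP_VERSION = hip_major * 100 + hip_minor        # 601 -> hipBLASLt path is used
BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"    # "61" -> libbitsandbytes_rocm61.so

print(BNB_HIP_VERSION, BNB_HIP_VERSION_SHORT)  # 601 61
```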

csrc/kernels.hip

Lines changed: 0 additions & 2 deletions
@@ -10,8 +10,6 @@
 #include <hipcub/hipcub.hpp>
 #include <hip/hip_math_constants.h>

-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
 //#include <mma.h>


csrc/ops_hip.cuh

Lines changed: 0 additions & 6 deletions
@@ -21,12 +21,6 @@
 #include <vector>
 #include <functional>

-/*
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
-*/
-
-
 #define CUDA_CHECK_RETURN(value) { \
     hipError_t _m_cudaStat = value; \
     if (_m_cudaStat != hipSuccess) { \

docs/source/installation.mdx

Lines changed: 15 additions & 5 deletions
@@ -146,15 +146,25 @@ Please follow these steps to install bitsandbytes with device-specific backend s
 bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).

 > [!TIP]
-> If you already installed ROCm and PyTorch, skip Docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide.
+> If you would like to install ROCm and PyTorch on bare metal, skip Docker steps and refer to our official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Please make sure to get PyTorch wheel for the installed ROCm version.

 ```bash
-# Create a docker container with latest pytorch. It comes with ROCm and pytorch preinstalled
-docker pull rocm/pytorch:latest
-docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/pytorch:latest
+# Create a docker container with latest ROCm image, which includes ROCm libraries
+docker pull rocm/dev-ubuntu-22.04:6.1.2-complete
+docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/dev-ubuntu-22.04:6.1.2-complete
+apt-get update && apt-get install -y git && cd home

+# Install pytorch compatible with above ROCm version
+pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/
+
+# Install bitsandbytes from PyPI
+# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100
+# Please install from source if your configuration doesn't match with these)
+pip install bitsandbytes
+
+# Install bitsandbytes from source
 # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
-git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/

 # Install dependencies
 pip install -r requirements-dev.txt
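After following the documented steps, a quick sanity check could look like the sketch below. It relies only on `torch.version.hip` and the `HIP_ENVIRONMENT` flag defined in `bitsandbytes/cextension.py` above, and is offered as a convenience rather than an official verification step.

```python
# Sketch: verify that PyTorch sees ROCm and that bitsandbytes picked its HIP/ROCm backend.
import torch
import bitsandbytes  # noqa: F401  (imports the native ROCm library if available)
from bitsandbytes.cextension import HIP_ENVIRONMENT

print("torch HIP version:", torch.version.hip)            # e.g. 6.1.x on a ROCm build of PyTorch
print("GPU visible to torch:", torch.cuda.is_available())  # True when an AMD GPU is usable
print("bitsandbytes ROCm backend active:", HIP_ENVIRONMENT)
```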

tests/test_functional.py

Lines changed: 3 additions & 0 deletions
@@ -584,6 +584,9 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans
 @pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb"))
 @pytest.mark.parametrize("device", ("cuda", "cpu"), ids=id_formatter("device"))
 def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb, device):
+    if HIP_ENVIRONMENT and device == "cpu":
+        pytest.skip("this test is not supported on ROCm yet")
+
     for i in range(k):
         if dims == 2:
             A = torch.randint(-128, 127, size=(dim1, dim3), device=device).to(torch.int8)
