
Commit 6d9b69b

pnunna93, akx, and Titus-von-Koeller authored

Enable bitsandbytes packaging for ROCm (#1299)
* Add build job for rocm
* Add rocm build script
* Copy shared obj file into output_dir
* upload build artifacts and enable wheels build
* Remove cuda build temporarily
* Add ROCm version to .so filename
* Add rocm_version to whls build
* Revert "Remove cuda build temporarily". This reverts commit 1413c5f.
* Add rocm_version env var
* Remove thrush header files
* Print node info
* print cuda node info
* Revert "print cuda node info". This reverts commit cdb209a.
* Revert "Print node info". This reverts commit 7e9a65c.
* Add rocm arch to compile command
* Rename .so files to rocm
* Update default gpu arch
* Skip cpu based igemmlt int tests on ROCm
* Update Documentation
* Update upstream repo name
* Update docs
* Update string format (Co-authored-by: Aarni Koskela <akx@iki.fi>)
* Remove pre-release option for torch install
* Update pytorch install path (Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>)

---------

Co-authored-by: Aarni Koskela <akx@iki.fi>
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
1 parent 1775035 commit 6d9b69b

8 files changed

Lines changed: 43 additions & 22 deletions

File tree

- .github/scripts/build-rocm.sh
- .github/workflows/python-package.yml
- CMakeLists.txt
- bitsandbytes/cextension.py
- csrc/kernels.hip
- csrc/ops_hip.cuh
- docs/source/installation.mdx
- tests/test_functional.py

.github/scripts/build-rocm.sh

Lines changed: 7 additions & 5 deletions
@@ -1,19 +1,21 @@
 #!/bin/bash
 declare build_arch
 declare build_os
+declare rocm_version

 set -xeuo pipefail
+bnb_rocm_arch="gfx90a;gfx942;gfx1100"
 if [ "${build_os:0:6}" == ubuntu ]; then
-    image=rocm/dev-ubuntu-22.04:6.1-complete
+    image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
     echo "Using image $image"
     docker run --rm --platform "linux/$build_arch" -i \
         -w /src -v "$PWD:/src" "$image" sh -c \
         "apt-get update \
            && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-           && cmake -DCOMPUTE_BACKEND=hip . \
+           && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
            && cmake --build ."
 fi

-#output_dir="output/${build_os}/${build_arch}"
-#mkdir -p "${output_dir}"
-#(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
+output_dir="output/${build_os}/${build_arch}"
+mkdir -p "${output_dir}"
+(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
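For local experimentation, the sketch below shows one way to drive this script with the environment variables it declares. The variable values and the subprocess wrapper are illustrative assumptions, not part of the CI setup; the workflow sets these values from its build matrix.

```python
# Minimal sketch: invoke the ROCm build script locally with the variables it
# declares (build_os, build_arch, rocm_version). Requires Docker and should be
# run from the repository root so the source tree is mounted at /src.
import os
import subprocess

env = dict(
    os.environ,
    build_os="ubuntu-latest",   # "ubuntu..." selects the Docker branch of the script
    build_arch="x86_64",        # passed to docker via --platform linux/<arch>
    rocm_version="6.1.2",       # picks the rocm/dev-ubuntu-22.04:<version>-complete image
)

subprocess.run(["bash", ".github/scripts/build-rocm.sh"], env=env, check=True)
```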

.github/workflows/python-package.yml

Lines changed: 10 additions & 0 deletions
@@ -106,6 +106,8 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         arch: [x86_64]
+        rocm_version:
+          ["6.1.2"]
     runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
     steps:
       - uses: actions/checkout@v4
@@ -123,10 +125,18 @@
         env:
           build_os: ${{ matrix.os }}
           build_arch: ${{ matrix.arch }}
+          rocm_version: ${{ matrix.rocm_version }}
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
+          path: output/*
+          retention-days: 7
   build-wheels:
     needs:
       - build-shared-libs
       - build-shared-libs-cuda
+      - build-shared-libs-rocm
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
CMakeLists.txt

Lines changed: 4 additions & 2 deletions
@@ -185,7 +185,7 @@ elseif(BUILD_HIP)
     set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH})
   else()
     if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
-      set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx940;gfx941;gfx942")
+      set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100")
     elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
       set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
     endif()
@@ -194,12 +194,14 @@ elseif(BUILD_HIP)

   list(APPEND SRC_FILES ${HIP_FILES})

-  string(APPEND BNB_OUTPUT_NAME "_hip")
+  string(APPEND BNB_OUTPUT_NAME "_rocm")

   # get hip version
   execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION)
   string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}")
+  string(REPLACE "." "" HIP_VERSION_SHORT "${HIP_VERSION}")

+  string(APPEND BNB_OUTPUT_NAME "${HIP_VERSION_SHORT}")
   if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1")
     string(APPEND BNB_OUTPUT_NAME "_nohipblaslt")
   endif()
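For illustration only, the Python sketch below mirrors the naming scheme the CMake snippet above produces for the ROCm shared library; it assumes a Linux `.so` suffix and is not part of the build.

```python
# Sketch of the output library name derived from the HIP version, mirroring
# the CMake logic above (illustrative helper, not build code).
def rocm_library_name(hip_version: str, no_hipblaslt: bool = False) -> str:
    major, minor = hip_version.split(".")[:2]
    name = f"libbitsandbytes_rocm{major}{minor}"   # e.g. "_rocm61" for HIP 6.1
    if no_hipblaslt or (int(major), int(minor)) < (6, 1):
        name += "_nohipblaslt"                     # hipBLASLt requires HIP >= 6.1
    return name + ".so"

print(rocm_library_name("6.1"))  # libbitsandbytes_rocm61.so
print(rocm_library_name("6.0"))  # libbitsandbytes_rocm60_nohipblaslt.so
```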

bitsandbytes/cextension.py

Lines changed: 4 additions & 2 deletions
@@ -38,9 +38,9 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path:
     """
     if torch.version.hip:
         if BNB_HIP_VERSION < 601:
-            return PACKAGE_DIR / f"libbitsandbytes_hip_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}"
+            return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}"
         else:
-            return PACKAGE_DIR / f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}"
+            return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}{DYNAMIC_LIBRARY_SUFFIX}"
     library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}"
     if not cuda_specs.has_cublaslt:
         # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt
@@ -119,8 +119,10 @@ def get_native_library() -> BNBNativeLibrary:
     if torch.version.hip:
         hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
         HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
+        BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
     else:
         HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
+        BNB_HIP_VERSION_SHORT = ""
     lib = get_native_library()
 except Exception as e:
     lib = None
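Worked example (not part of the change): how the loader above derives its flags from `torch.version.hip`. The sample version string is hypothetical.

```python
# Sketch of the version parsing in cextension.py for a hypothetical ROCm 6.1 install.
hip_version_string = "6.1.40093-xxxxxxxx"  # illustrative torch.version.hip value

hip_major, hip_minor = map(int, hip_version_string.split(".")[0:2])
BNB_HIP_VERSION = hip_major * 100 + hip_minor        # 601 -> hipBLASLt path is used
BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"    # "61" -> libbitsandbytes_rocm61.so

print(BNB_HIP_VERSION, BNB_HIP_VERSION_SHORT)  # 601 61
```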

csrc/kernels.hip

Lines changed: 0 additions & 2 deletions
@@ -10,8 +10,6 @@
 #include <hipcub/hipcub.hpp>
 #include <hip/hip_math_constants.h>

-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
 //#include <mma.h>


csrc/ops_hip.cuh

Lines changed: 0 additions & 6 deletions
@@ -21,12 +21,6 @@
 #include <vector>
 #include <functional>

-/*
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
-*/
-
-
 #define CUDA_CHECK_RETURN(value) { \
     hipError_t _m_cudaStat = value; \
     if (_m_cudaStat != hipSuccess) { \

docs/source/installation.mdx

Lines changed: 15 additions & 5 deletions
@@ -146,15 +146,25 @@ Please follow these steps to install bitsandbytes with device-specific backend s
 bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).

 > [!TIP]
-> If you already installed ROCm and PyTorch, skip Docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide.
+> If you would like to install ROCm and PyTorch on bare metal, skip Docker steps and refer to our official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Please make sure to get PyTorch wheel for the installed ROCm version.

 ```bash
-# Create a docker container with latest pytorch. It comes with ROCm and pytorch preinstalled
-docker pull rocm/pytorch:latest
-docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/pytorch:latest
+# Create a docker container with latest ROCm image, which includes ROCm libraries
+docker pull rocm/dev-ubuntu-22.04:6.1.2-complete
+docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/dev-ubuntu-22.04:6.1.2-complete
+apt-get update && apt-get install -y git && cd home

+# Install pytorch compatible with above ROCm version
+pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/
+
+# Install bitsandbytes from PyPI
+# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100
+# Please install from source if your configuration doesn't match with these)
+pip install bitsandbytes
+
+# Install bitsandbytes from source
 # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
-git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/

 # Install dependencies
 pip install -r requirements-dev.txt
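After following the documented steps, a quick sanity check could look like the sketch below. It relies only on `torch.version.hip` and the `HIP_ENVIRONMENT` flag defined in `bitsandbytes/cextension.py` above, and is offered as a convenience rather than an official verification step.

```python
# Sketch: verify that PyTorch sees ROCm and that bitsandbytes picked its HIP/ROCm backend.
import torch
import bitsandbytes  # noqa: F401  (imports the native ROCm library if available)
from bitsandbytes.cextension import HIP_ENVIRONMENT

print("torch HIP version:", torch.version.hip)            # e.g. 6.1.x on a ROCm build of PyTorch
print("GPU visible to torch:", torch.cuda.is_available())  # True when an AMD GPU is usable
print("bitsandbytes ROCm backend active:", HIP_ENVIRONMENT)
```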

tests/test_functional.py

Lines changed: 3 additions & 0 deletions
@@ -584,6 +584,9 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans
 @pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb"))
 @pytest.mark.parametrize("device", ("cuda", "cpu"), ids=id_formatter("device"))
 def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb, device):
+    if HIP_ENVIRONMENT and device == "cpu":
+        pytest.skip("this test is not supported on ROCm yet")
+
     for i in range(k):
         if dims == 2:
             A = torch.randint(-128, 127, size=(dim1, dim3), device=device).to(torch.int8)
