Skip to content

Commit f887942

Browse files
Merge upstream/main into merge-cuda-hip
2 parents ebdda00 + ffadf57 commit f887942

22 files changed

Lines changed: 651 additions & 1608 deletions

.github/workflows/test-runner.yml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ on:
2626
gpu_type:
2727
type: string
2828
default: ""
29-
description: "GPU type for CUDA testing: T4, L40S"
29+
description: "GPU type for CUDA testing: T4, A10, L40S"
3030
# cpu_type currently only affects linux x64 CPU testing to select specific CPU architectures
3131
cpu_type:
3232
type: string
@@ -65,11 +65,14 @@ jobs:
6565
T4)
6666
TEST_RUNNER="bandb-aws-g4dn-4xlarge-plus-use1-public-80"
6767
;;
68+
A10)
69+
TEST_RUNNER="bandb-aws-g5-4xlarge-plus-use1-public-80"
70+
;;
6871
L40S)
6972
TEST_RUNNER="bandb-aws-g6e-4xlarge-plus-use1-public-80"
7073
;;
7174
*)
72-
echo "::error::Must specify gpu_type (T4 or L40S) for linux-x64 cuda backend"
75+
echo "::error::Must specify gpu_type (T4, A10, L40S) for linux-x64 cuda backend"
7376
exit 1
7477
;;
7578
esac
@@ -164,7 +167,7 @@ jobs:
164167
run: bash .github/scripts/build-cuda.sh
165168
env:
166169
cuda_version: ${{ inputs.cuda_version }}
167-
cuda_targets: "75;89"
170+
cuda_targets: "75;80;89"
168171

169172
- name: Upload build artifact
170173
uses: actions/upload-artifact@v4

.github/workflows/tests-nightly.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ jobs:
5656
matrix:
5757
# Linux x64 cross-product
5858
platform: [linux-x64]
59-
gpu_type: [T4, L40S]
59+
gpu_type: [T4, A10, L40S]
6060
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "13.0.2"]
6161

6262
include:

.github/workflows/tests-pr.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ jobs:
6464
fail-fast: false
6565
matrix:
6666
platform: [linux-x64]
67-
gpu_type: [T4, L40S]
67+
gpu_type: [T4, A10, L40S]
6868
cuda_version: ["11.8.0", "12.8.1", "13.0.2"]
6969

7070
include:

CLAUDE.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ git worktree add ~/git/bnb-fix-<NUMBER> -b fix/issue-<NUMBER>
88
cd ~/git/bnb-fix-<NUMBER>
99
```
1010

11-
This keeps the main checkout clean and allows parallel sessions. If you are already inside a worktree directory, you do not need to create another one. Full guide: `agents/worktree_guide.md`
11+
This keeps the main checkout clean and allows parallel sessions. If you are already inside a worktree directory, you do not need to create another one.
12+
13+
**Before creating a worktree**, check the worktree registry for existing ones — see the Git Worktrees section in `~/.claude/CLAUDE.md`. Bitsandbytes-specific naming conventions: `agents/worktree_guide.md`. General worktree guide: `~/git/lab_tools/worktree_guide.md`.
1214

1315
# MANDATORY: Check for existing PRs before starting work
1416

CMakeLists.txt

Lines changed: 66 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,38 @@
1010
# Separate by semicolons, i.e. `-DCOMPUTE_CAPABILITY=89;90;100;120`
1111
# Check your compute capability here: https://developer.nvidia.com/cuda-gpus
1212
# - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler
13+
# - ROCM_VERSION: Override the ROCm version shortcode used in the output library name.
14+
# Useful when PyTorch was built against a different ROCm version than the
15+
# system install. For example, `-DROCM_VERSION=70` produces
16+
# libbitsandbytes_rocm70.so even if the system has ROCm 7.2.
1317
cmake_minimum_required(VERSION 3.22.1)
1418

19+
# On Windows with HIP backend, auto-detect compilers from ROCM_PATH before project()
20+
if(WIN32 AND COMPUTE_BACKEND STREQUAL "hip")
21+
if(DEFINED ENV{ROCM_PATH})
22+
set(ROCM_PATH $ENV{ROCM_PATH})
23+
endif()
24+
if(ROCM_PATH AND NOT DEFINED CMAKE_CXX_COMPILER)
25+
set(CMAKE_CXX_COMPILER "${ROCM_PATH}/lib/llvm/bin/clang++.exe")
26+
endif()
27+
if(ROCM_PATH AND NOT DEFINED CMAKE_HIP_COMPILER)
28+
set(CMAKE_HIP_COMPILER "${ROCM_PATH}/lib/llvm/bin/clang++.exe")
29+
endif()
30+
# On Windows, the HIP compiler needs explicit paths to find device libraries.
31+
if(ROCM_PATH)
32+
find_path(ROCM_DEVICE_LIB_PATH
33+
NAMES oclc_abi_version_400.bc ocml.bc
34+
PATHS "${ROCM_PATH}/amdgcn/bitcode"
35+
"${ROCM_PATH}/lib/llvm/amdgcn/bitcode"
36+
NO_DEFAULT_PATH
37+
)
38+
set(CMAKE_HIP_FLAGS "--rocm-path=${ROCM_PATH}")
39+
if(ROCM_DEVICE_LIB_PATH)
40+
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --rocm-device-lib-path=${ROCM_DEVICE_LIB_PATH}")
41+
endif()
42+
endif()
43+
endif()
44+
1545
project(bitsandbytes LANGUAGES CXX)
1646

1747
# If run without specifying a build type, default to using the Release configuration:
@@ -199,17 +229,18 @@ if(BUILD_CUDA)
199229
string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
200230
add_compile_definitions(BUILD_CUDA)
201231
elseif(BUILD_HIP)
202-
enable_language(HIP)
203-
message(STATUS "HIP Compiler: ${CMAKE_HIP_COMPILER}")
232+
# Set target architectures before enable_language(HIP), which would otherwise
233+
# auto-detect a single GPU and override the defaults.
204234
if(DEFINED BNB_ROCM_ARCH)
205235
set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH})
206-
else()
207-
if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
208-
set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100;gfx1101;gfx1150;gfx1151;gfx1200;gfx1201")
209-
elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
210-
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
211-
endif()
236+
elseif(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
237+
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
238+
elseif(NOT CMAKE_HIP_ARCHITECTURES)
239+
set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100;gfx1101;gfx1150;gfx1151;gfx1200;gfx1201")
212240
endif()
241+
242+
enable_language(HIP)
243+
message(STATUS "HIP Compiler: ${CMAKE_HIP_COMPILER}")
213244
message(STATUS "HIP Targets: ${CMAKE_HIP_ARCHITECTURES}")
214245

215246
list(APPEND SRC_FILES ${GPU_FILES})
@@ -221,7 +252,15 @@ elseif(BUILD_HIP)
221252
string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}")
222253
string(REPLACE "." "" HIP_VERSION_SHORT "${HIP_VERSION}")
223254

224-
string(APPEND BNB_OUTPUT_NAME "${HIP_VERSION_SHORT}")
255+
# Expose a cache variable that the user can set to override the ROCm version in the library name
256+
set(ROCM_VERSION "${HIP_VERSION_SHORT}" CACHE STRING "Expected ROCm Version Shortcode")
257+
258+
message(STATUS "ROCm Version: ${HIP_VERSION_SHORT} (from hipconfig)")
259+
if(NOT ROCM_VERSION STREQUAL "${HIP_VERSION_SHORT}")
260+
message(WARNING "Overriding ROCm version in library name: ${HIP_VERSION_SHORT} -> ${ROCM_VERSION}")
261+
endif()
262+
263+
string(APPEND BNB_OUTPUT_NAME "${ROCM_VERSION}")
225264
add_compile_definitions(__HIP_PLATFORM_AMD__)
226265
add_compile_definitions(__HIP_PLATFORM_HCC__)
227266
add_compile_definitions(BUILD_HIP)
@@ -262,6 +301,8 @@ endif()
262301
if(WIN32)
263302
# Export all symbols
264303
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
304+
# Prevent Windows SDK min/max macros from conflicting with std::min/std::max
305+
add_compile_definitions(NOMINMAX)
265306
endif()
266307

267308
if(MSVC)
@@ -314,10 +355,11 @@ if(BUILD_CUDA)
314355
)
315356
endif()
316357
if(BUILD_HIP)
317-
if(NOT DEFINED ENV{ROCM_PATH})
318-
set(ROCM_PATH /opt/rocm)
319-
else()
358+
# Determine ROCM_PATH from the environment variable; fall back to /opt/rocm (the usual Linux location) when it is unset
359+
if(DEFINED ENV{ROCM_PATH})
320360
set(ROCM_PATH $ENV{ROCM_PATH})
361+
else()
362+
set(ROCM_PATH /opt/rocm)
321363
endif()
322364
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
323365
macro(find_package_and_print_version PACKAGE_NAME)
@@ -329,14 +371,23 @@ if(BUILD_HIP)
329371
find_package_and_print_version(hipsparse REQUIRED)
330372

331373
## Hacky way of excluding hip::amdhip64: when it is linked, many tests (e.g. adam8bit) unexpectedly fail because of inaccuracies.
332-
set_target_properties(hip::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
333-
set_target_properties(hip-lang::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
334-
set(CMAKE_HIP_IMPLICIT_LINK_LIBRARIES "")
374+
## On Windows, we need to link amdhip64 explicitly
375+
if(NOT WIN32)
376+
set_target_properties(hip::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
377+
set_target_properties(hip-lang::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
378+
set(CMAKE_HIP_IMPLICIT_LINK_LIBRARIES "")
379+
endif()
335380

336381
target_include_directories(bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/include ${ROCM_PATH}/include /include)
337382
target_link_directories(bitsandbytes PRIVATE ${ROCM_PATH}/lib /lib)
338383
target_link_libraries(bitsandbytes PUBLIC roc::hipblas hip::hiprand roc::hipsparse)
339384

385+
# On Windows, rocblas is not pulled in transitively by roc::hipblas
386+
# and is needed because ops_hip.cuh uses rocblas_handle directly.
387+
if(WIN32)
388+
target_link_libraries(bitsandbytes PUBLIC rocblas)
389+
endif()
390+
340391
target_compile_definitions(bitsandbytes PUBLIC BNB_USE_HIP)
341392
set_source_files_properties(${GPU_FILES} PROPERTIES LANGUAGE HIP)
342393
set_target_properties(bitsandbytes PROPERTIES LINKER_LANGUAGE CXX)

agents/architecture_guide.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,7 @@ GPU-specific functions are actually invoked.
329329
### Environment variables
330330

331331
- `BNB_CUDA_VERSION` — Override the auto-detected CUDA version for library selection
332+
- `BNB_ROCM_VERSION` — ROCm equivalent of `BNB_CUDA_VERSION`: override the auto-detected ROCm version for library selection
332333
- Standard CUDA env vars (`CUDA_HOME`, `LD_LIBRARY_PATH`) affect library discovery
333334

334335
---

agents/dispatch_guide.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,27 @@ python3 agents/fetch_issues.py
1212

1313
Read `agents/github_tools_guide.md` for the full reference on how to use the query tools.
1414

15+
## Step 0: Check Existing Reviews on Open PRs
16+
17+
Before looking at issues, check whether there are open PRs from external contributors that need review. But **do not assume a PR needs review just because it is open** — check whether a review has already been posted.
18+
19+
```bash
20+
# List open PRs
21+
gh pr list --state open --limit 30
22+
23+
# For each external contributor PR, check for existing reviews
24+
gh api repos/bitsandbytes-foundation/bitsandbytes/pulls/<NUMBER>/reviews \
25+
--jq '.[] | "\(.user.login) | \(.state) | \(.submitted_at)"'
26+
```
27+
28+
A PR only needs a new review if:
29+
30+
- **No review exists at all** from a maintainer or agent
31+
- **The author has pushed new commits** since the last review (check commit dates vs review dates)
32+
- **The author has responded to review feedback** and the PR needs a re-review
33+
34+
If a review already exists and the author has not responded or pushed changes, the ball is in the author's court — skip that PR. Do not generate a prompt to re-review work that has already been reviewed.
35+
1536
## Step 1: Find Candidate Issues
1637

1738
Start by getting the landscape of open issues:

0 commit comments

Comments
 (0)