Skip to content

Commit 57156ee

Browse files
Merge remote-tracking branch 'upstream/main' into merge-cuda-hip
# Conflicts:
#	csrc/kernels.cu
#	csrc/kernels.hip
#	csrc/kernels_hip.cuh
#	csrc/ops.cu
#	csrc/ops.cuh
#	csrc/ops.hip
#	csrc/ops_hip.cuh
#	csrc/pythonInterface.cpp
2 parents f887942 + 96b37ec commit 57156ee

58 files changed

Lines changed: 68 additions & 4993 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

.github/workflows/python-package.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ jobs:
8686
cuda: ${{ matrix.cuda_version }}
8787
method: "network"
8888
# The "crt" "nvvm" and "nvptxcompiler" components are added for CUDA 13.
89-
sub-packages: ${{ format('["nvcc"{0},"cudart","cusparse","cublas","thrust","cublas_dev","cusparse_dev"]', startsWith(matrix.cuda_version, '13.') && ',"crt","nvvm","nvptxcompiler"' || '') }}
89+
sub-packages: ${{ format('["nvcc"{0},"cudart","cublas","thrust","cublas_dev"]', startsWith(matrix.cuda_version, '13.') && ',"crt","nvvm","nvptxcompiler"' || '') }}
9090
use-github-cache: false
9191
use-local-cache: false
9292
log-file-suffix: ${{matrix.os}}-${{matrix.cuda_version}}.txt

.github/workflows/test-runner.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ jobs:
148148
with:
149149
cuda: ${{ inputs.cuda_version }}
150150
method: "network"
151-
sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
151+
sub-packages: '["nvcc","cudart","cublas","thrust","nvrtc_dev","cublas_dev"]'
152152
use-github-cache: false
153153

154154
# Windows: Setup MSVC (needed for both CPU and CUDA builds)

CMakeLists.txt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ endif()
348348

349349
if(BUILD_CUDA)
350350
target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
351-
target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cusparse)
351+
target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cublasLt)
352352
set_target_properties(bitsandbytes
353353
PROPERTIES
354354
CUDA_SEPARABLE_COMPILATION ON
@@ -368,7 +368,6 @@ if(BUILD_HIP)
368368
endmacro()
369369
find_package_and_print_version(hipblas REQUIRED)
370370
find_package_and_print_version(hiprand REQUIRED)
371-
find_package_and_print_version(hipsparse REQUIRED)
372371

373372
## hacky way of excluding hip::amdhip64 (with it linked many tests unexpectedly fail e.g. adam8bit because of inaccuracies)
374373
## On Windows, we need to link amdhip64 explicitly
@@ -380,7 +379,7 @@ if(BUILD_HIP)
380379

381380
target_include_directories(bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/include ${ROCM_PATH}/include /include)
382381
target_link_directories(bitsandbytes PRIVATE ${ROCM_PATH}/lib /lib)
383-
target_link_libraries(bitsandbytes PUBLIC roc::hipblas hip::hiprand roc::hipsparse)
382+
target_link_libraries(bitsandbytes PUBLIC roc::hipblas hip::hiprand)
384383

385384
# On Windows, rocblas is not pulled in transitively by roc::hipblas
386385
# and is needed because ops_hip.cuh uses rocblas_handle directly.

agents/api_surface.md

Lines changed: 8 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -390,8 +390,7 @@ bitsandbytes.optim.optimizer.Optimizer8bit(params, defaults, optim_bits=32, is_p
390390
bitsandbytes.optim.optimizer.Optimizer2State(
391391
optimizer_name, params, lr=1e-3, betas=(0.9, 0.999),
392392
eps=1e-8, weight_decay=0.0, optim_bits=32, args=None,
393-
min_8bit_size=4096, percentile_clipping=100,
394-
block_wise=True, max_unorm=0.0, skip_zeros=False,
393+
min_8bit_size=4096, max_unorm=0.0, skip_zeros=False,
395394
is_paged=False, alpha=0.0, t_alpha=None, t_beta3=None,
396395
)
397396
```
@@ -405,8 +404,7 @@ bitsandbytes.optim.optimizer.Optimizer2State(
405404
bitsandbytes.optim.optimizer.Optimizer1State(
406405
optimizer_name, params, lr=1e-3, betas=(0.9, 0.0),
407406
eps=1e-8, weight_decay=0.0, optim_bits=32, args=None,
408-
min_8bit_size=4096, percentile_clipping=100,
409-
block_wise=True, max_unorm=0.0, skip_zeros=False,
407+
min_8bit_size=4096, max_unorm=0.0, skip_zeros=False,
410408
is_paged=False,
411409
)
412410
```
@@ -532,8 +530,6 @@ All bnb optimizers share these parameters beyond the standard PyTorch ones:
532530
|-----------|------|---------|-------------|
533531
| `optim_bits` | `int` | 32 | 32 for full precision state, 8 for quantized state |
534532
| `min_8bit_size` | `int` | 4096 | Parameters smaller than this use 32-bit state even in 8-bit mode |
535-
| `percentile_clipping` | `int` | 100 | Gradient clipping at a percentile. 100 = disabled |
536-
| `block_wise` | `bool` | `True` | Block-wise quantization of optimizer states (vs global) |
537533
| `max_unorm` | `float` | 0.0 | Maximum update norm relative to weight norm. 0 = disabled |
538534
| `skip_zeros` | `bool` | `False` | Skip zero gradients in sparse models |
539535
| `is_paged` | `bool` | `False` | Use CUDA managed memory for state offloading |
@@ -864,57 +860,7 @@ F.batched_igemm(
864860
Batched int8 matrix multiplication.
865861
**Stability:** Stable (internal).
866862

867-
### 4.9 Sparse Operations
868-
869-
#### `COOSparseTensor`
870-
871-
```python
872-
class F.COOSparseTensor:
873-
def __init__(self, rows, cols, nnz, rowidx, colidx, values): ...
874-
```
875-
876-
**Stability:** Legacy — used internally for sparse decomposition.
877-
878-
#### `CSRSparseTensor` / `CSCSparseTensor`
879-
880-
Similar sparse tensor containers.
881-
**Stability:** Legacy.
882-
883-
#### `coo_zeros`
884-
885-
```python
886-
F.coo_zeros(rows, cols, nnz, device, dtype=torch.half) -> COOSparseTensor
887-
```
888-
889-
#### `coo2csr` / `coo2csc`
890-
891-
```python
892-
F.coo2csr(cooA: COOSparseTensor) -> CSRSparseTensor
893-
F.coo2csc(cooA: COOSparseTensor) -> CSCSparseTensor
894-
```
895-
896-
#### `spmm_coo`
897-
898-
```python
899-
F.spmm_coo(
900-
cooA: COOSparseTensor, B: torch.Tensor,
901-
out: Optional[torch.Tensor] = None,
902-
) -> torch.Tensor
903-
```
904-
905-
Sparse matrix-dense matrix multiply using cusparse.
906-
**Stability:** Legacy.
907-
908-
#### `spmm_coo_very_sparse`
909-
910-
```python
911-
F.spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None) -> torch.Tensor
912-
```
913-
914-
Optimized for very sparse matrices with custom kernel.
915-
**Stability:** Legacy.
916-
917-
### 4.10 Paged Memory
863+
### 4.9 Paged Memory
918864

919865
#### `get_paged`
920866

@@ -934,7 +880,7 @@ F.prefetch_tensor(A: torch.Tensor, to_cpu: bool = False) -> None
934880
Prefetch a paged tensor to GPU or CPU.
935881
**Stability:** Stable (internal).
936882

937-
### 4.11 CPU-Specific Functions
883+
### 4.10 CPU-Specific Functions
938884

939885
#### `_convert_weight_packed_for_cpu`
940886

@@ -967,7 +913,7 @@ F.has_avx512bf16() -> bool
967913
Detects AVX512BF16 CPU support.
968914
**Stability:** Internal but may be useful externally.
969915

970-
### 4.12 Utility Functions
916+
### 4.11 Utility Functions
971917

972918
#### `is_on_gpu`
973919

@@ -987,7 +933,7 @@ F.get_ptr(A: Optional[Tensor]) -> Optional[ct.c_void_p]
987933
Gets the data pointer of a tensor for ctypes calls.
988934
**Stability:** Internal.
989935

990-
### 4.13 Singleton Managers
936+
### 4.12 Singleton Managers
991937

992938
#### `GlobalPageManager`
993939

@@ -1007,15 +953,6 @@ F.CUBLAS_Context.get_instance() -> CUBLAS_Context
1007953
Manages cuBLAS context handles per device.
1008954
**Stability:** Internal.
1009955

1010-
#### `Cusparse_Context`
1011-
1012-
```python
1013-
F.Cusparse_Context.get_instance() -> Cusparse_Context
1014-
```
1015-
1016-
Manages cusparse context handle.
1017-
**Stability:** Internal.
1018-
1019956
---
1020957

1021958
## 5. Autograd Functions
@@ -1238,7 +1175,7 @@ bitsandbytes.utils.replace_linear(
12381175
| Class | Description |
12391176
|-------|-------------|
12401177
| `BNBNativeLibrary` | Base wrapper for the ctypes-loaded native library |
1241-
| `CudaBNBNativeLibrary` | CUDA-specific subclass (sets up context/cusparse/managed ptr) |
1178+
| `CudaBNBNativeLibrary` | CUDA-specific subclass (sets up context/managed ptr) |
12421179
| `ErrorHandlerMockBNBNativeLibrary` | Fallback mock that defers error messages to call time |
12431180

12441181
### Module-level symbols
@@ -1313,7 +1250,6 @@ removed in a future release.
13131250
| `quantize_no_absmax` | `functional` | `quantize_blockwise` |
13141251
| `dequantize_no_absmax` | `functional` | `dequantize_blockwise` |
13151252
| `optimizer_update_8bit` | `functional` | `optimizer_update_8bit_blockwise` |
1316-
| `percentile_clipping` | `functional` | N/A (still used internally by non-blockwise path) |
13171253

13181254
---
13191255

@@ -1401,11 +1337,9 @@ A PR that changes any of these symbols MUST consider downstream impact:
14011337

14021338
- `bitsandbytes.cextension.*` (native library loading)
14031339
- `bitsandbytes.functional.get_ptr`, `is_on_gpu`, `_get_tensor_stream`
1404-
- `bitsandbytes.functional.GlobalPageManager`, `CUBLAS_Context`, `Cusparse_Context`
1340+
- `bitsandbytes.functional.GlobalPageManager`, `CUBLAS_Context`
14051341
- `bitsandbytes.functional._convert_weight_packed_for_cpu*`
14061342
- `bitsandbytes.functional.check_matmul`, `elementwise_func`, `fill`, `_mul`
1407-
- `bitsandbytes.functional.spmm_coo`, `spmm_coo_very_sparse`
1408-
- `bitsandbytes.functional.COOSparseTensor`, `CSRSparseTensor`, `CSCSparseTensor`
14091343
- `bitsandbytes.utils.pack_dict_to_tensor`, `unpack_tensor_to_dict`
14101344
- `bitsandbytes.utils.execute_and_return`, `sync_gpu`
14111345
- `bitsandbytes.optim.optimizer.MockArgs`

agents/architecture_guide.md

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -231,10 +231,6 @@ All ops are defined with the namespace `bitsandbytes::`:
231231
**Optimizer ops:**
232232
- `optimizer_update_32bit` — 32-bit optimizer step (Adam, Lion, SGD, etc.)
233233
- `optimizer_update_8bit_blockwise` — 8-bit blockwise optimizer step
234-
- `optimizer_update_8bit` — 8-bit non-blockwise optimizer step (legacy)
235-
236-
**Utility ops:**
237-
- `percentile_clipping` — adaptive gradient clipping by percentile
238234

239235
---
240236

@@ -745,10 +741,8 @@ The base class `Optimizer2State.update_step()` then dispatches based on state dt
745741
def update_step(self, group, p, gindex, pindex):
746742
if state["state1"].dtype == torch.float:
747743
F.optimizer_update_32bit(self.optimizer_name, grad, p, state1, ...)
748-
elif state["state1"].dtype == torch.uint8 and config["block_wise"]:
744+
elif state["state1"].dtype == torch.uint8:
749745
F.optimizer_update_8bit_blockwise(self.optimizer_name, grad, p, state1, ...)
750-
elif state["state1"].dtype == torch.uint8 and not config["block_wise"]:
751-
F.optimizer_update_8bit(self.optimizer_name, grad, p, state1, ...)
752746
```
753747

754748
### Optimizer state initialization
@@ -968,8 +962,8 @@ The `COMPUTE_BACKEND` CMake variable selects the target:
968962
| Backend | Library name | Languages | Dependencies |
969963
|---|---|---|---|
970964
| `cpu` | `libbitsandbytes_cpu.so` | C++17 | OpenMP (optional) |
971-
| `cuda` | `libbitsandbytes_cuda{VER}.so` | C++17 + CUDA | cudart, cublas, cublasLt, cusparse |
972-
| `hip` | `libbitsandbytes_rocm{VER}.so` | C++17 + HIP | hipblas, hiprand, hipsparse |
965+
| `cuda` | `libbitsandbytes_cuda{VER}.so` | C++17 + CUDA | cudart, cublas, cublasLt |
966+
| `hip` | `libbitsandbytes_rocm{VER}.so` | C++17 + HIP | hipblas, hiprand |
973967
| `mps` | `libbitsandbytes_mps.dylib` | C++17 + ObjC++ | Metal framework |
974968
| `xpu` | `libbitsandbytes_xpu.so` | C++20 + SYCL | Intel oneAPI |
975969
@@ -1080,7 +1074,7 @@ Optimizer8bit.step():
10801074
├── p.data = p.data.contiguous()
10811075
├── config = self.get_config(gindex, pindex, group)
10821076
1083-
├── state["state1"].dtype == uint8 and block_wise:
1077+
├── state["state1"].dtype == uint8:
10841078
│ F.optimizer_update_8bit_blockwise("adam", grad, p, state1, state2,
10851079
│ beta1, beta2, ..., qmap1, qmap2, absmax1, absmax2, ...)
10861080
│ ↓

agents/code_standards.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ class GlobalOptimManager:
152152
```
153153

154154
This pattern is used by: `GlobalOptimManager`, `GlobalPageManager`, `CUBLAS_Context`,
155-
`Cusparse_Context`, `GlobalOutlierPooler`, `OutlierTracer`.
155+
`GlobalOutlierPooler`, `OutlierTracer`.
156156

157157
---
158158

@@ -867,7 +867,6 @@ Use the project's error checking macros:
867867
868868
```cpp
869869
CUDA_CHECK_RETURN(cudaMemcpy(...));
870-
CHECK_CUSPARSE(cusparseCreate(...));
871870
```
872871

873872
The `checkCublasStatus` function returns an error code rather than throwing — the Python

agents/issue_patterns.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ These are the single largest category of issues. Most are environment problems o
3434
>
3535
> If you're still hitting problems on the **latest** bitsandbytes (v0.45+), please open a new issue with the output of `python -m bitsandbytes` and your environment details.
3636
37-
### Missing `libcusparse.so.11` / shared library mismatch
37+
### Missing shared CUDA library / shared library mismatch
3838

39-
**How to identify:** `OSError: libcusparse.so.11: cannot open shared object file: No such file or directory`. Or similar errors for `libcusparse.so.12`, `libcublasLt.so.11`, etc.
39+
**How to identify:** `OSError: libcublasLt.so.11: cannot open shared object file: No such file or directory`. Or similar errors for `libcudart`, `libcublas`, etc.
4040

4141
**What happened:** The bnb binary was compiled against one CUDA version (e.g., 11.x) but the system only has another (e.g., 12.x). The shared library dependencies don't exist. Modern releases ship platform-specific wheels with better CUDA version detection and multiple binary variants.
4242

agents/security_guide.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,6 @@ bitsandbytes/autograd/_functions.py:
445445
```
446446
bitsandbytes/functional.py:
447447
- optimizer_update_8bit_blockwise() — 8-bit optimizer step
448-
- percentile_clipping() — gradient clipping for optimizer stability
449448
450449
csrc/ops.cu / kernels.cu:
451450
- Optimizer kernel implementations

benchmarking/switchback/README.md

Lines changed: 0 additions & 4 deletions
This file was deleted.

0 commit comments

Comments (0)