Skip to content

Commit 57156ee

Browse files
Merge remote-tracking branch 'upstream/main' into merge-cuda-hip
# Conflicts:
#	csrc/kernels.cu
#	csrc/kernels.hip
#	csrc/kernels_hip.cuh
#	csrc/ops.cu
#	csrc/ops.cuh
#	csrc/ops.hip
#	csrc/ops_hip.cuh
#	csrc/pythonInterface.cpp
2 parents f887942 + 96b37ec commit 57156ee

58 files changed

Lines changed: 68 additions & 4993 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

.github/workflows/python-package.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ jobs:
8686
cuda: ${{ matrix.cuda_version }}
8787
method: "network"
8888
# The "crt" "nvvm" and "nvptxcompiler" components are added for CUDA 13.
89-
sub-packages: ${{ format('["nvcc"{0},"cudart","cusparse","cublas","thrust","cublas_dev","cusparse_dev"]', startsWith(matrix.cuda_version, '13.') && ',"crt","nvvm","nvptxcompiler"' || '') }}
89+
sub-packages: ${{ format('["nvcc"{0},"cudart","cublas","thrust","cublas_dev"]', startsWith(matrix.cuda_version, '13.') && ',"crt","nvvm","nvptxcompiler"' || '') }}
9090
use-github-cache: false
9191
use-local-cache: false
9292
log-file-suffix: ${{matrix.os}}-${{matrix.cuda_version}}.txt

.github/workflows/test-runner.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ jobs:
148148
with:
149149
cuda: ${{ inputs.cuda_version }}
150150
method: "network"
151-
sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
151+
sub-packages: '["nvcc","cudart","cublas","thrust","nvrtc_dev","cublas_dev"]'
152152
use-github-cache: false
153153

154154
# Windows: Setup MSVC (needed for both CPU and CUDA builds)

CMakeLists.txt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ endif()
348348

349349
if(BUILD_CUDA)
350350
target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
351-
target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cusparse)
351+
target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cublasLt)
352352
set_target_properties(bitsandbytes
353353
PROPERTIES
354354
CUDA_SEPARABLE_COMPILATION ON
@@ -368,7 +368,6 @@ if(BUILD_HIP)
368368
endmacro()
369369
find_package_and_print_version(hipblas REQUIRED)
370370
find_package_and_print_version(hiprand REQUIRED)
371-
find_package_and_print_version(hipsparse REQUIRED)
372371

373372
## hacky way of excluding hip::amdhip64 (with it linked many tests unexpectedly fail e.g. adam8bit because of inaccuracies)
374373
## On Windows, we need to link amdhip64 explicitly
@@ -380,7 +379,7 @@ if(BUILD_HIP)
380379

381380
target_include_directories(bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/include ${ROCM_PATH}/include /include)
382381
target_link_directories(bitsandbytes PRIVATE ${ROCM_PATH}/lib /lib)
383-
target_link_libraries(bitsandbytes PUBLIC roc::hipblas hip::hiprand roc::hipsparse)
382+
target_link_libraries(bitsandbytes PUBLIC roc::hipblas hip::hiprand)
384383

385384
# On Windows, rocblas is not pulled in transitively by roc::hipblas
386385
# and is needed because ops_hip.cuh uses rocblas_handle directly.

agents/api_surface.md

Lines changed: 8 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -390,8 +390,7 @@ bitsandbytes.optim.optimizer.Optimizer8bit(params, defaults, optim_bits=32, is_p
390390
bitsandbytes.optim.optimizer.Optimizer2State(
391391
optimizer_name, params, lr=1e-3, betas=(0.9, 0.999),
392392
eps=1e-8, weight_decay=0.0, optim_bits=32, args=None,
393-
min_8bit_size=4096, percentile_clipping=100,
394-
block_wise=True, max_unorm=0.0, skip_zeros=False,
393+
min_8bit_size=4096, max_unorm=0.0, skip_zeros=False,
395394
is_paged=False, alpha=0.0, t_alpha=None, t_beta3=None,
396395
)
397396
```
@@ -405,8 +404,7 @@ bitsandbytes.optim.optimizer.Optimizer2State(
405404
bitsandbytes.optim.optimizer.Optimizer1State(
406405
optimizer_name, params, lr=1e-3, betas=(0.9, 0.0),
407406
eps=1e-8, weight_decay=0.0, optim_bits=32, args=None,
408-
min_8bit_size=4096, percentile_clipping=100,
409-
block_wise=True, max_unorm=0.0, skip_zeros=False,
407+
min_8bit_size=4096, max_unorm=0.0, skip_zeros=False,
410408
is_paged=False,
411409
)
412410
```
@@ -532,8 +530,6 @@ All bnb optimizers share these parameters beyond the standard PyTorch ones:
532530
|-----------|------|---------|-------------|
533531
| `optim_bits` | `int` | 32 | 32 for full precision state, 8 for quantized state |
534532
| `min_8bit_size` | `int` | 4096 | Parameters smaller than this use 32-bit state even in 8-bit mode |
535-
| `percentile_clipping` | `int` | 100 | Gradient clipping at a percentile. 100 = disabled |
536-
| `block_wise` | `bool` | `True` | Block-wise quantization of optimizer states (vs global) |
537533
| `max_unorm` | `float` | 0.0 | Maximum update norm relative to weight norm. 0 = disabled |
538534
| `skip_zeros` | `bool` | `False` | Skip zero gradients in sparse models |
539535
| `is_paged` | `bool` | `False` | Use CUDA managed memory for state offloading |
@@ -864,57 +860,7 @@ F.batched_igemm(
864860
Batched int8 matrix multiplication.
865861
**Stability:** Stable (internal).
866862

867-
### 4.9 Sparse Operations
868-
869-
#### `COOSparseTensor`
870-
871-
```python
872-
class F.COOSparseTensor:
873-
def __init__(self, rows, cols, nnz, rowidx, colidx, values): ...
874-
```
875-
876-
**Stability:** Legacy — used internally for sparse decomposition.
877-
878-
#### `CSRSparseTensor` / `CSCSparseTensor`
879-
880-
Similar sparse tensor containers.
881-
**Stability:** Legacy.
882-
883-
#### `coo_zeros`
884-
885-
```python
886-
F.coo_zeros(rows, cols, nnz, device, dtype=torch.half) -> COOSparseTensor
887-
```
888-
889-
#### `coo2csr` / `coo2csc`
890-
891-
```python
892-
F.coo2csr(cooA: COOSparseTensor) -> CSRSparseTensor
893-
F.coo2csc(cooA: COOSparseTensor) -> CSCSparseTensor
894-
```
895-
896-
#### `spmm_coo`
897-
898-
```python
899-
F.spmm_coo(
900-
cooA: COOSparseTensor, B: torch.Tensor,
901-
out: Optional[torch.Tensor] = None,
902-
) -> torch.Tensor
903-
```
904-
905-
Sparse matrix-dense matrix multiply using cusparse.
906-
**Stability:** Legacy.
907-
908-
#### `spmm_coo_very_sparse`
909-
910-
```python
911-
F.spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None) -> torch.Tensor
912-
```
913-
914-
Optimized for very sparse matrices with custom kernel.
915-
**Stability:** Legacy.
916-
917-
### 4.10 Paged Memory
863+
### 4.9 Paged Memory
918864

919865
#### `get_paged`
920866

@@ -934,7 +880,7 @@ F.prefetch_tensor(A: torch.Tensor, to_cpu: bool = False) -> None
934880
Prefetch a paged tensor to GPU or CPU.
935881
**Stability:** Stable (internal).
936882

937-
### 4.11 CPU-Specific Functions
883+
### 4.10 CPU-Specific Functions
938884

939885
#### `_convert_weight_packed_for_cpu`
940886

@@ -967,7 +913,7 @@ F.has_avx512bf16() -> bool
967913
Detects AVX512BF16 CPU support.
968914
**Stability:** Internal but may be useful externally.
969915

970-
### 4.12 Utility Functions
916+
### 4.11 Utility Functions
971917

972918
#### `is_on_gpu`
973919

@@ -987,7 +933,7 @@ F.get_ptr(A: Optional[Tensor]) -> Optional[ct.c_void_p]
987933
Gets the data pointer of a tensor for ctypes calls.
988934
**Stability:** Internal.
989935

990-
### 4.13 Singleton Managers
936+
### 4.12 Singleton Managers
991937

992938
#### `GlobalPageManager`
993939

@@ -1007,15 +953,6 @@ F.CUBLAS_Context.get_instance() -> CUBLAS_Context
1007953
Manages cuBLAS context handles per device.
1008954
**Stability:** Internal.
1009955

1010-
#### `Cusparse_Context`
1011-
1012-
```python
1013-
F.Cusparse_Context.get_instance() -> Cusparse_Context
1014-
```
1015-
1016-
Manages cusparse context handle.
1017-
**Stability:** Internal.
1018-
1019956
---
1020957

1021958
## 5. Autograd Functions
@@ -1238,7 +1175,7 @@ bitsandbytes.utils.replace_linear(
12381175
| Class | Description |
12391176
|-------|-------------|
12401177
| `BNBNativeLibrary` | Base wrapper for the ctypes-loaded native library |
1241-
| `CudaBNBNativeLibrary` | CUDA-specific subclass (sets up context/cusparse/managed ptr) |
1178+
| `CudaBNBNativeLibrary` | CUDA-specific subclass (sets up context/managed ptr) |
12421179
| `ErrorHandlerMockBNBNativeLibrary` | Fallback mock that defers error messages to call time |
12431180

12441181
### Module-level symbols
@@ -1313,7 +1250,6 @@ removed in a future release.
13131250
| `quantize_no_absmax` | `functional` | `quantize_blockwise` |
13141251
| `dequantize_no_absmax` | `functional` | `dequantize_blockwise` |
13151252
| `optimizer_update_8bit` | `functional` | `optimizer_update_8bit_blockwise` |
1316-
| `percentile_clipping` | `functional` | N/A (still used internally by non-blockwise path) |
13171253

13181254
---
13191255

@@ -1401,11 +1337,9 @@ A PR that changes any of these symbols MUST consider downstream impact:
14011337

14021338
- `bitsandbytes.cextension.*` (native library loading)
14031339
- `bitsandbytes.functional.get_ptr`, `is_on_gpu`, `_get_tensor_stream`
1404-
- `bitsandbytes.functional.GlobalPageManager`, `CUBLAS_Context`, `Cusparse_Context`
1340+
- `bitsandbytes.functional.GlobalPageManager`, `CUBLAS_Context`
14051341
- `bitsandbytes.functional._convert_weight_packed_for_cpu*`
14061342
- `bitsandbytes.functional.check_matmul`, `elementwise_func`, `fill`, `_mul`
1407-
- `bitsandbytes.functional.spmm_coo`, `spmm_coo_very_sparse`
1408-
- `bitsandbytes.functional.COOSparseTensor`, `CSRSparseTensor`, `CSCSparseTensor`
14091343
- `bitsandbytes.utils.pack_dict_to_tensor`, `unpack_tensor_to_dict`
14101344
- `bitsandbytes.utils.execute_and_return`, `sync_gpu`
14111345
- `bitsandbytes.optim.optimizer.MockArgs`

agents/architecture_guide.md

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -231,10 +231,6 @@ All ops are defined with the namespace `bitsandbytes::`:
231231
**Optimizer ops:**
232232
- `optimizer_update_32bit` — 32-bit optimizer step (Adam, Lion, SGD, etc.)
233233
- `optimizer_update_8bit_blockwise` — 8-bit blockwise optimizer step
234-
- `optimizer_update_8bit` — 8-bit non-blockwise optimizer step (legacy)
235-
236-
**Utility ops:**
237-
- `percentile_clipping` — adaptive gradient clipping by percentile
238234

239235
---
240236

@@ -745,10 +741,8 @@ The base class `Optimizer2State.update_step()` then dispatches based on state dt
745741
def update_step(self, group, p, gindex, pindex):
746742
if state["state1"].dtype == torch.float:
747743
F.optimizer_update_32bit(self.optimizer_name, grad, p, state1, ...)
748-
elif state["state1"].dtype == torch.uint8 and config["block_wise"]:
744+
elif state["state1"].dtype == torch.uint8:
749745
F.optimizer_update_8bit_blockwise(self.optimizer_name, grad, p, state1, ...)
750-
elif state["state1"].dtype == torch.uint8 and not config["block_wise"]:
751-
F.optimizer_update_8bit(self.optimizer_name, grad, p, state1, ...)
752746
```
753747

754748
### Optimizer state initialization
@@ -968,8 +962,8 @@ The `COMPUTE_BACKEND` CMake variable selects the target:
968962
| Backend | Library name | Languages | Dependencies |
969963
|---|---|---|---|
970964
| `cpu` | `libbitsandbytes_cpu.so` | C++17 | OpenMP (optional) |
971-
| `cuda` | `libbitsandbytes_cuda{VER}.so` | C++17 + CUDA | cudart, cublas, cublasLt, cusparse |
972-
| `hip` | `libbitsandbytes_rocm{VER}.so` | C++17 + HIP | hipblas, hiprand, hipsparse |
965+
| `cuda` | `libbitsandbytes_cuda{VER}.so` | C++17 + CUDA | cudart, cublas, cublasLt |
966+
| `hip` | `libbitsandbytes_rocm{VER}.so` | C++17 + HIP | hipblas, hiprand |
973967
| `mps` | `libbitsandbytes_mps.dylib` | C++17 + ObjC++ | Metal framework |
974968
| `xpu` | `libbitsandbytes_xpu.so` | C++20 + SYCL | Intel oneAPI |
975969
@@ -1080,7 +1074,7 @@ Optimizer8bit.step():
10801074
├── p.data = p.data.contiguous()
10811075
├── config = self.get_config(gindex, pindex, group)
10821076
1083-
├── state["state1"].dtype == uint8 and block_wise:
1077+
├── state["state1"].dtype == uint8:
10841078
│ F.optimizer_update_8bit_blockwise("adam", grad, p, state1, state2,
10851079
│ beta1, beta2, ..., qmap1, qmap2, absmax1, absmax2, ...)
10861080
│ ↓

agents/code_standards.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ class GlobalOptimManager:
152152
```
153153

154154
This pattern is used by: `GlobalOptimManager`, `GlobalPageManager`, `CUBLAS_Context`,
155-
`Cusparse_Context`, `GlobalOutlierPooler`, `OutlierTracer`.
155+
`GlobalOutlierPooler`, `OutlierTracer`.
156156

157157
---
158158

@@ -867,7 +867,6 @@ Use the project's error checking macros:
867867
868868
```cpp
869869
CUDA_CHECK_RETURN(cudaMemcpy(...));
870-
CHECK_CUSPARSE(cusparseCreate(...));
871870
```
872871

873872
The `checkCublasStatus` function returns an error code rather than throwing — the Python

agents/issue_patterns.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ These are the single largest category of issues. Most are environment problems o
3434
>
3535
> If you're still hitting problems on the **latest** bitsandbytes (v0.45+), please open a new issue with the output of `python -m bitsandbytes` and your environment details.
3636
37-
### Missing `libcusparse.so.11` / shared library mismatch
37+
### Missing shared CUDA library / shared library mismatch
3838

39-
**How to identify:** `OSError: libcusparse.so.11: cannot open shared object file: No such file or directory`. Or similar errors for `libcusparse.so.12`, `libcublasLt.so.11`, etc.
39+
**How to identify:** `OSError: libcublasLt.so.11: cannot open shared object file: No such file or directory`. Or similar errors for `libcudart`, `libcublas`, etc.
4040

4141
**What happened:** The bnb binary was compiled against one CUDA version (e.g., 11.x) but the system only has another (e.g., 12.x). The shared library dependencies don't exist. Modern releases ship platform-specific wheels with better CUDA version detection and multiple binary variants.
4242

agents/security_guide.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,6 @@ bitsandbytes/autograd/_functions.py:
445445
```
446446
bitsandbytes/functional.py:
447447
- optimizer_update_8bit_blockwise() — 8-bit optimizer step
448-
- percentile_clipping() — gradient clipping for optimizer stability
449448
450449
csrc/ops.cu / kernels.cu:
451450
- Optimizer kernel implementations

benchmarking/switchback/README.md

Lines changed: 0 additions & 4 deletions
This file was deleted.

0 commit comments

Comments (0)