Commit c6f2575

fix
1 parent 2de5ec3 commit c6f2575

2 files changed: 36 additions & 8 deletions

bitsandbytes/backends/cuda/ops.py

Lines changed: 14 additions & 3 deletions
@@ -74,9 +74,20 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor
 
     if has_error:
         if has_error == 100:
-            # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
-            # TODO: Warn and implement a fallback to fp32 compute?
-            raise NotImplementedError("int8_linear_matmul not implemented!")
+            # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`. The HIP backend
+            # also returns this when no usable hipBLASLt algo exists for the shape
+            # (seen on MI300X for some small-n int8 gemms). Fall back to fp32 — same
+            # path used for the `lda % 4 != 0` case above.
+            import warnings
+
+            warnings.warn(
+                f"int8_linear_matmul has no usable (hip|cu)blasLt algo for shape "
+                f"{shapeA=} {shapeB=}; falling back to fp32 matmul.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+            result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
+            return out.copy_(result)
         else:
             raise RuntimeError(
                 f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"

csrc/ops.cu

Lines changed: 22 additions & 5 deletions
@@ -327,7 +327,10 @@ int igemmlt(
         bnb_blasLtPrefSetAttr(pref, BNB_BLASLT_PREF_MAX_WORKSPACE, &max_workspace_size, sizeof(max_workspace_size))
     );
 
-    const int request_solutions = 1;
+    // hipBLASLt's first heuristic algo can be unusable for small-n int8 gemms
+    // (e.g. n=4 on MI300X) and fails at matmul time with INVALID_VALUE. Request
+    // several candidates and use the first one that actually runs successfully.
+    const int request_solutions = 8;
     bnb_blasLt_heuristic_t heuristicResult[request_solutions];
     int returnedAlgoCount = 0;
     checkBlasLtStatus(bnb_blasLtAlgoGetHeuristic(
@@ -340,10 +343,24 @@ int igemmlt(
         fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
     } else {
         int alpha = 1, beta = 0;
-        has_error |= checkBlasLtStatus(bnb_blasLtMatmul(
-            ltHandle, matmulDesc, &alpha, A, aDesc, B, bDesc, &beta, (int32_t*)C, cDesc, (int32_t*)C, cDesc,
-            &heuristicResult[0].algo, NULL, 0, stream
-        ));
+        bnb_blas_status_t matmul_status = BNB_BLAS_STATUS_SUCCESS;
+        for (int i = 0; i < returnedAlgoCount; ++i) {
+            matmul_status = bnb_blasLtMatmul(
+                ltHandle, matmulDesc, &alpha, A, aDesc, B, bDesc, &beta, (int32_t*)C, cDesc, (int32_t*)C, cDesc,
+                &heuristicResult[i].algo, NULL, 0, stream
+            );
+            if (matmul_status == BNB_BLAS_STATUS_SUCCESS)
+                break;
+        }
+        if (matmul_status != BNB_BLAS_STATUS_SUCCESS) {
+            // Every workspace-free algo hipBLASLt offered failed at runtime
+            // (seen on MI300X for some small-n int8 gemms). Drain the HIP
+            // last-error flag the failed launches set, otherwise the next
+            // unrelated HIP call will inherit it. Then signal the Python
+            // wrapper to take the fp32 fallback via ERR_NOT_IMPLEMENTED.
+            (void)hipGetLastError();
+            return ERR_NOT_IMPLEMENTED;
+        }
     }
 #else
     int alpha = 1, beta = 0;
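
Seen from the Python side, an affected shape should now warn and return a usable result instead of raising NotImplementedError. A hypothetical repro sketch, assuming a ROCm build where hipBLASLt rejects every offered algo for the shape, and assuming the torch.ops entry point registered by the multi-backend refactor:

import warnings

import torch
import bitsandbytes  # noqa: F401  (importing registers the bitsandbytes:: ops)

A = torch.randint(-128, 128, (16, 64), dtype=torch.int8, device="cuda")
B = torch.randint(-128, 128, (4, 64), dtype=torch.int8, device="cuda")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    out = torch.ops.bitsandbytes.int8_linear_matmul(A, B)

# On an affected device the fallback now warns instead of raising.
if any(issubclass(w.category, RuntimeWarning) for w in caught):
    print("fp32 fallback path was taken")
print(out.shape, out.dtype)  # expected: torch.Size([16, 4]) torch.int32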
