Skip to content

Commit 1c96a34

Browse files
fix when A.numel() is not divisible by blocksize
1 parent a5a7f5d commit 1c96a34

2 files changed

Lines changed: 32 additions & 2 deletions

File tree

bitsandbytes/backends/triton/ops.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,12 @@ def quantize_4bit(
7676

7777
n = A.numel()
7878

79-
# TODO: Support when weight matrix is not divisible by blocksize
80-
# torch._check(n % blocksize == 0, lambda: f"n must be divisible by blocksize, got {n} and {blocksize}")
79+
# Pad to next multiple of blocksize so the kernel always processes full blocks
80+
remainder = n % blocksize
81+
if remainder != 0:
82+
padding = blocksize - remainder
83+
A = torch.nn.functional.pad(A.view(-1), (0, padding), value=0.0)
84+
n = A.numel()
8185

8286
blocks = -(n // -(blocksize * 2))
8387

tests/test_ops.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,32 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize
172172

173173
opcheck(torch.ops.bitsandbytes.quantize_4bit.default, (A, blocksize, quant_type, storage_dtype))
174174

175+
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256])
def test_quantize_4bit_not_divisible_by_blocksize(self, device, dtype, quant_type, blocksize):
    """Round-trip quantize/dequantize an input whose element count is not a blocksize multiple."""
    storage_dtype = torch.uint8
    # 7 * (blocksize - 1) = 7*blocksize - 7 is never a multiple of blocksize,
    # so this exercises the ragged-final-block path.
    shape = (7, blocksize - 1)
    original = torch.randn(shape, dtype=dtype, device=device)

    # Quantization must succeed rather than raise on the non-divisible size.
    packed, absmax = torch.ops.bitsandbytes.quantize_4bit(original, blocksize, quant_type, storage_dtype)
    assert packed.device == original.device
    assert absmax.device == original.device

    # The inverse op must restore the caller-visible shape and dtype.
    restored = torch.ops.bitsandbytes.dequantize_4bit(packed, absmax, blocksize, quant_type, shape, dtype)
    assert restored.shape == shape
    assert restored.dtype == dtype

    # No NaN/Inf may leak in from the zero-padding of the final block.
    assert torch.isfinite(restored).all(), "Dequantized output contains NaN or Inf"
200+
175201
@pytest.mark.parametrize("device", get_available_devices())
176202
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
177203
@pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))

0 commit comments

Comments
 (0)