Commit 5102319

Testing cleanup

1 parent b86ff64 · commit 5102319

8 files changed · 161 additions and 130 deletions

benchmarking/optimizer_benchmark.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+"""
+Extracted from tests/test_optim.py
+
+Usage: pytest benchmarking/optimizer_benchmark.py
+"""
+
+import time
+
+import pytest
+from tests.helpers import describe_dtype, id_formatter
+import torch
+
+import bitsandbytes as bnb
+
+str2optimizers = {"paged_adamw": (torch.optim.AdamW, bnb.optim.PagedAdamW)}
+
+
+@pytest.mark.parametrize("dim1", [2 * 1024], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("gtype", [torch.float16], ids=describe_dtype)
+@pytest.mark.parametrize("optim_name", ["paged_adamw"], ids=id_formatter("optim_name"))
+@pytest.mark.parametrize("mode", ["bnb"], ids=id_formatter("mode"))
+@pytest.mark.benchmark
+def test_stream_optimizer_bench(dim1, gtype, optim_name, mode):
+    layers1 = torch.nn.Sequential(*torch.nn.ModuleList([torch.nn.Linear(dim1, dim1) for i in range(10)]))
+    layers1 = layers1.to(gtype)
+    layers1 = layers1.cuda()
+
+    large_tensor = None
+    if mode == "torch":
+        optim = str2optimizers[optim_name][0](layers1.parameters())
+    else:
+        optim = str2optimizers[optim_name][1](layers1.parameters())
+        # 12 GB
+        large_tensor = torch.empty((int(4.5e9),), device="cuda")
+
+    torch.cuda.synchronize()
+    time.sleep(5)
+
+    num_batches = 5
+    batches = torch.randn(num_batches, 128, dim1, device="cuda").to(gtype)
+    lbls = torch.randint(0, 10, size=(num_batches, 128)).cuda()
+
+    for i in range(num_batches):
+        print(i)
+        b = batches[i]
+        if i == 2:
+            torch.cuda.synchronize()
+            t0 = time.time()
+
+        out1 = layers1(b)
+
+        loss1 = torch.nn.functional.cross_entropy(out1, lbls[i]).mean()
+        loss1.backward()
+        optim.step()
+    torch.cuda.synchronize()
+    print(mode, time.time() - t0)

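Note on the benchmark above: the `large_tensor` allocation deliberately fills most of GPU memory so the paged optimizer is forced to page its state between host and device. A minimal standalone sketch of what the bnb branch exercises (assuming a CUDA device; the layer size here is arbitrary):

    import torch
    import bitsandbytes as bnb

    model = torch.nn.Linear(2048, 2048, device="cuda", dtype=torch.float16)
    optim = bnb.optim.PagedAdamW(model.parameters())  # optimizer state lives in paged memory

    x = torch.randn(8, 2048, device="cuda", dtype=torch.float16)
    model(x).float().mean().backward()
    optim.step()  # state is paged onto the GPU on demand
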
tests/conftest.py

Lines changed: 11 additions & 0 deletions
@@ -1,11 +1,22 @@
 import gc
+import random

+import numpy as np
 import pytest
 import torch


+def _set_seed():
+    torch.manual_seed(0)
+    torch.cuda.manual_seed_all(0)
+    torch.mps.manual_seed(0)
+    np.random.seed(0)
+    random.seed(0)
+
+
 def pytest_runtest_call(item):
     try:
+        _set_seed()
         item.runtest()
     except AssertionError as ae:
         if str(ae) == "Torch not compiled with CUDA enabled":

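Re-seeding inside `pytest_runtest_call` puts every test at the same RNG state across Python's `random`, NumPy, CPU, CUDA, and MPS before it runs, so threshold-based assertions behave reproducibly. The core effect, illustrated standalone:

    import torch

    torch.manual_seed(0)
    a = torch.randn(3)
    torch.manual_seed(0)
    b = torch.randn(3)
    assert torch.equal(a, b)  # identical draws after identical seeding
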
tests/test_autograd.py

Lines changed: 4 additions & 87 deletions
@@ -6,7 +6,6 @@
     BOOLEAN_TRIPLES,
     TRUE_FALSE,
     describe_dtype,
-    get_test_dims,
     id_formatter,
 )

@@ -136,10 +135,10 @@ def test_matmullt(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, dec
         torch.testing.assert_close(gradBias1, gradBias2)


-@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1"))
-@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2"))
-@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3"))
-@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4"))
+@pytest.mark.parametrize("dim1", [48], ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim2", [64, 0], ids=id_formatter("dim2"))
+@pytest.mark.parametrize("dim3", [64], ids=id_formatter("dim3"))
+@pytest.mark.parametrize("dim4", [96], ids=id_formatter("dim4"))
 @pytest.mark.parametrize("funcs", [(torch.matmul, bnb.matmul_4bit)], ids=["func=matmul"])
 @pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad"))
 @pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose"))
@@ -231,85 +230,3 @@ def test_matmul_4bit(

     if req_grad[2]:
         torch.testing.assert_close(gradBias1, gradBias2)
-
-
-@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1"))
-@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2"))
-@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3"))
-@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4"))
-@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad"))
-@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose"))
-@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype)
-@pytest.mark.parametrize(
-    "funcs",
-    [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)],
-    ids=["matmul_fp8_mixed", "matmul_fp8_global"],
-)
-def test_matmul_fp8(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
-    dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2)
-    dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3)
-    req_grad = list(req_grad)
-    req_grad[2] = False
-
-    for i in range(3):
-        # normal multiply
-        if funcs[0] in [torch.mm, torch.matmul]:
-            A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype)
-            B = torch.randn(size=dimB, device="cuda", requires_grad=req_grad[1], dtype=dtype)
-            target = torch.randn(size=(dim2, dim4), device="cuda", requires_grad=req_grad[1], dtype=dtype)
-
-            torch.nn.init.xavier_uniform_(B)
-
-            fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(A.device)
-            bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(A.device)
-
-            if not transpose[0] and transpose[1]:
-                out_torch = funcs[0](A, B.t())
-                out_bnb = funcs[1](A, B.t(), fw_code, bw_code)
-            elif not transpose[0] and not transpose[1]:
-                out_torch = funcs[0](A, B)
-                out_bnb = funcs[1](A, B, fw_code, bw_code)
-
-            assert out_bnb.dtype == A.dtype, f"bnb matmullt received {A.dtype} but returned {out_bnb.dtype}"
-
-            n = out_bnb.numel()
-            err = torch.abs(out_bnb - out_torch).float().mean().item()
-            if n > 0:
-                assert err < 0.115
-                # assert err < 0.20
-            if any(req_grad):
-                out_bnb.data.copy_(out_torch)
-                torch.cuda.synchronize()
-                loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean()
-                loss_bnb.backward()
-                gradA1 = A.grad
-                gradB1 = B.grad
-                A.grad = None
-                B.grad = None
-
-                loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean()
-                loss_torch.backward()
-                gradA2 = A.grad
-                gradB2 = B.grad
-                A.grad = None
-                B.grad = None
-
-                if req_grad[0]:
-                    torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1)
-
-                if req_grad[1]:
-                    n = gradB1.numel()
-                    if dim2 > 0:
-                        assert torch.abs(gradB1).sum() > 0.0
-                        assert torch.abs(gradB2).sum() > 0.0
-                    else:
-                        assert torch.abs(gradB1).sum() == 0.0
-                        assert torch.abs(gradB2).sum() == 0.0
-                    idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3)

-                    assert (idx == 0).sum().item() <= n * 0.1
-                    idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3)
-                    assert (idx == 0).sum().item() <= n * 0.02
-                    grad_err = (gradB1 - gradB2).abs().mean()
-                    assert grad_err.item() < 0.003
-                    torch.testing.assert_close(gradB1, gradB2, atol=0.18, rtol=0.3)

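Pinning the formerly randomized dimensions to literals such as [48] and [64, 0] keeps the parametrized test IDs and coverage stable between runs. For context, a helper of roughly this shape is what the removed calls used (assumed implementation, inferred from the call sites; the real one lives in tests/helpers.py):

    import random

    def get_test_dims(lo: int, hi: int, *, n: int) -> list[int]:
        # n random dimensions in [lo, hi]; collection differed on every run
        return [random.randint(lo, hi) for _ in range(n)]
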
tests/test_deprecated.py

Lines changed: 87 additions & 0 deletions
@@ -3,7 +3,10 @@
 from scipy.stats import norm
 import torch

+import bitsandbytes as bnb
 from bitsandbytes import functional as F
+from tests.helpers import BOOLEAN_TRIPLES, describe_dtype, get_test_dims, id_formatter
+from tests.test_autograd import TRANSPOSE_VALS


 @pytest.mark.deprecated
@@ -121,3 +124,87 @@ def test_percentile_clipping(gtype):
     torch.testing.assert_close(gnorm_vec1, torch.sqrt(gnorm_vec2))
     torch.testing.assert_close(clip1, clip2)
     torch.testing.assert_close(gnorm1, gnorm2)
+
+
+@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1"))
+@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2"))
+@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3"))
+@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4"))
+@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad"))
+@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose"))
+@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype)
+@pytest.mark.parametrize(
+    "funcs",
+    [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)],
+    ids=["matmul_fp8_mixed", "matmul_fp8_global"],
+)
+@pytest.mark.deprecated
+@pytest.mark.skip("Deprecated functionality, to be removed.")
+def test_matmul_fp8(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
+    dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2)
+    dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3)
+    req_grad = list(req_grad)
+    req_grad[2] = False
+
+    for i in range(3):
+        # normal multiply
+        if funcs[0] in [torch.mm, torch.matmul]:
+            A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype)
+            B = torch.randn(size=dimB, device="cuda", requires_grad=req_grad[1], dtype=dtype)
+            target = torch.randn(size=(dim2, dim4), device="cuda", requires_grad=req_grad[1], dtype=dtype)
+
+            torch.nn.init.xavier_uniform_(B)
+
+            fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(A.device)
+            bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(A.device)
+
+            if not transpose[0] and transpose[1]:
+                out_torch = funcs[0](A, B.t())
+                out_bnb = funcs[1](A, B.t(), fw_code, bw_code)
+            elif not transpose[0] and not transpose[1]:
+                out_torch = funcs[0](A, B)
+                out_bnb = funcs[1](A, B, fw_code, bw_code)
+
+            assert out_bnb.dtype == A.dtype, f"bnb matmullt received {A.dtype} but returned {out_bnb.dtype}"
+
+            n = out_bnb.numel()
+            err = torch.abs(out_bnb - out_torch).float().mean().item()
+            if n > 0:
+                assert err < 0.115
+                # assert err < 0.20
+            if any(req_grad):
+                out_bnb.data.copy_(out_torch)
+                torch.cuda.synchronize()
+                loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean()
+                loss_bnb.backward()
+                gradA1 = A.grad
+                gradB1 = B.grad
+                A.grad = None
+                B.grad = None
+
+                loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean()
+                loss_torch.backward()
+                gradA2 = A.grad
+                gradB2 = B.grad
+                A.grad = None
+                B.grad = None
+
+                if req_grad[0]:
+                    torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1)
+
+                if req_grad[1]:
+                    n = gradB1.numel()
+                    if dim2 > 0:
+                        assert torch.abs(gradB1).sum() > 0.0
+                        assert torch.abs(gradB2).sum() > 0.0
+                    else:
+                        assert torch.abs(gradB1).sum() == 0.0
+                        assert torch.abs(gradB2).sum() == 0.0
+                    idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3)
+
+                    assert (idx == 0).sum().item() <= n * 0.1
+                    idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3)
+                    assert (idx == 0).sum().item() <= n * 0.02
+                    grad_err = (gradB1 - gradB2).abs().mean()
+                    assert grad_err.item() < 0.003
+                    torch.testing.assert_close(gradB1, gradB2, atol=0.18, rtol=0.3)

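The relocated test builds its quantization codebooks with `create_fp8_map(signed, exponent_bits, precision_bits, total_bits)`: an E4M3-style map for the forward pass and a wider-range E5M2-style map for gradients. A standalone sketch of round-tripping a tensor through such a codebook (assuming a CUDA device; blockwise quantization is one way to apply the map):

    import torch
    from bitsandbytes import functional as F

    code = F.create_fp8_map(True, 4, 3, 8).cuda()  # signed, 4 exponent / 3 mantissa bits
    x = torch.randn(1024, device="cuda")
    q, state = F.quantize_blockwise(x, code=code)
    x_hat = F.dequantize_blockwise(q, state)
    print((x - x_hat).abs().mean())  # small round-trip error
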
tests/test_functional.py

Lines changed: 1 addition & 1 deletion
@@ -893,7 +893,7 @@ def test_spmm_coo_very_sparse(self, dim1, dim2, dtype, out_func):

     @pytest.mark.parametrize("dim1", [256, 1024], ids=id_formatter("dim1"))
     @pytest.mark.parametrize("dim2", [256, 1024], ids=id_formatter("dim2"))
-    @pytest.skip("No longer supported")
+    @pytest.mark.skip("No longer supported")
     def test_integrated_sparse_decomp(self, dim1, dim2):
         threshold = 3.0
         for _ in range(k):

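The one-line fix above matters because `pytest.skip(...)` is an imperative call intended for use inside a test body; evaluated at decoration time it fires during import, and recent pytest versions reject module-level `skip` outright. The declarative form is `@pytest.mark.skip(...)`:

    import pytest

    @pytest.mark.skip("No longer supported")  # declarative: skipped at collection
    def test_decorated():
        ...

    def test_imperative():
        pytest.skip("No longer supported")  # imperative: skips when the body runs
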
tests/test_generation.py

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ def generate(model, tokenizer, text, generation_config, prompt_func=get_prompt_f
     return tokenizer.decode(outputs[0], skip_special_tokens=True)


-models = ["huggyllama/llama-7b", "bigscience/bloom-1b7"]
+models = ["bigscience/bloom-1b7"]
 dtypes = ["nf4", "fp4"]

tests/test_optim.py

Lines changed: 0 additions & 41 deletions
@@ -604,44 +604,3 @@ def test_benchmark_blockwise(dim1, dim2, gtype, optim_name):
     params = (total_steps - total_steps // 5) * dim1 * dim2
     print(optim_name, gtype, s, params, s / params)
     # assert s < 3.9
-
-
-@pytest.mark.parametrize("dim1", [2 * 1024], ids=id_formatter("dim1"))
-@pytest.mark.parametrize("gtype", [torch.float16], ids=describe_dtype)
-@pytest.mark.parametrize("optim_name", ["paged_adamw"], ids=id_formatter("optim_name"))
-@pytest.mark.parametrize("mode", ["bnb"], ids=id_formatter("mode"))
-@pytest.mark.benchmark
-def test_stream_optimizer_bench(dim1, gtype, optim_name, mode):
-    layers1 = torch.nn.Sequential(*torch.nn.ModuleList([torch.nn.Linear(dim1, dim1) for i in range(10)]))
-    layers1 = layers1.to(gtype)
-    layers1 = layers1.cuda()
-
-    large_tensor = None
-    if mode == "torch":
-        optim = str2optimizers[optim_name][0](layers1.parameters())
-    else:
-        optim = str2optimizers[optim_name][1](layers1.parameters())
-        # 12 GB
-        large_tensor = torch.empty((int(4.5e9),), device="cuda")
-
-    torch.cuda.synchronize()
-    time.sleep(5)
-
-    num_batches = 5
-    batches = torch.randn(num_batches, 128, dim1, device="cuda").to(gtype)
-    lbls = torch.randint(0, 10, size=(num_batches, 128)).cuda()
-
-    for i in range(num_batches):
-        print(i)
-        b = batches[i]
-        if i == 2:
-            torch.cuda.synchronize()
-            t0 = time.time()
-
-        out1 = layers1(b)
-
-        loss1 = torch.nn.functional.cross_entropy(out1, lbls[i]).mean()
-        loss1.backward()
-        optim.step()
-    torch.cuda.synchronize()
-    print(mode, time.time() - t0)

tests/test_triton.py

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@
     not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8,
     reason="This test requires triton and a GPU with compute capability 8.0 or higher.",
 )
+@pytest.mark.skip("No longer supported.")
 @pytest.mark.parametrize("vector_wise_quantization", TRUE_FALSE)
 def test_switchback(vector_wise_quantization):
     for dim in [83]:
