============================= test session starts =============================
platform win32 -- Python 3.12.10, pytest-9.0.2, pluggy-1.6.0
rootdir: C:\projects\bnb
configfile: pyproject.toml
plugins: anyio-4.12.1
collected 4206 items / 182 deselected / 4024 selected

tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 0%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 1%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 2%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 3%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 4%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 5%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 6%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 7%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 8%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 9%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 10%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 11%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 12%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 13%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 14%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 15%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%] tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%] 
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 16%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 16%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 17%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 18%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=T-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 19%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 20%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 21%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 22%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 23%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 24%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 25%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 26%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 27%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=T-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 28%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 29%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 30%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 31%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 32%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FT-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%] tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%] 
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 33%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 33%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 34%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=TFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 35%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FTF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 36%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFT-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-bf16-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 37%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=matmul-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=0.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=64-dim1=40-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmullt[has_bias=F-has_fp16_weights=F-transpose=FF-req_grad=FFF-fp32-func=switchback_bnb-decomp=6.0-dim4=48-dim3=32-dim2=0-dim1=40-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 38%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 39%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 40%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 41%]
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 41%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 42%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 43%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=fp4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 44%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 45%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp16-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 46%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=T-fp32-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 47%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 48%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp16-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 49%] tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%] 
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=T-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FT-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=TFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FTT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FTF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FFT-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=64-dim1=48-cuda] PASSED [ 50%]
tests/test_autograd.py::test_matmul_4bit[quant_type=nf4-compress_statistics=F-fp32-has_bias=F-transpose=FF-req_grad=FFF-func=matmul-dim4=96-dim3=64-dim2=0-dim1=48-cuda] PASSED [ 50%]
tests/test_cuda_setup_evaluator.py::test_get_cuda_bnb_library_path SKIPPED [ 50%]
tests/test_cuda_setup_evaluator.py::test_get_cuda_bnb_library_path_override SKIPPED [ 50%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-4096-nested=T-fp32-cuda] PASSED [ 50%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-4096-nested=T-fp16-cuda] PASSED [ 50%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-4096-nested=T-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-4096-nested=F-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-4096-nested=F-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-4096-nested=F-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-2048-nested=T-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-2048-nested=T-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-2048-nested=T-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-2048-nested=F-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-2048-nested=F-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-2048-nested=F-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-1024-nested=T-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-1024-nested=T-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-1024-nested=T-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-1024-nested=F-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-1024-nested=F-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-1024-nested=F-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-512-nested=T-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-512-nested=T-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-512-nested=T-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-512-nested=F-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-512-nested=F-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-512-nested=F-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-256-nested=T-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-256-nested=T-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-256-nested=T-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-256-nested=F-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-256-nested=F-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-256-nested=F-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-128-nested=T-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-128-nested=T-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-128-nested=T-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-128-nested=F-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-128-nested=F-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-128-nested=F-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-64-nested=T-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-64-nested=T-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-64-nested=T-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-64-nested=F-fp32-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-64-nested=F-fp16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=T-64-nested=F-bf16-cuda] PASSED [ 51%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-4096-nested=T-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-4096-nested=T-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-4096-nested=T-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-4096-nested=F-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-4096-nested=F-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-4096-nested=F-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-2048-nested=T-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-2048-nested=T-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-2048-nested=T-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-2048-nested=F-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-2048-nested=F-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-2048-nested=F-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-1024-nested=T-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-1024-nested=T-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-1024-nested=T-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-1024-nested=F-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-1024-nested=F-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-1024-nested=F-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-512-nested=T-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-512-nested=T-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-512-nested=T-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-512-nested=F-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-512-nested=F-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-512-nested=F-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-256-nested=T-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-256-nested=T-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-256-nested=T-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-256-nested=F-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-256-nested=F-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-256-nested=F-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-128-nested=T-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-128-nested=T-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-128-nested=T-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-128-nested=F-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-128-nested=F-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-128-nested=F-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-64-nested=T-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-64-nested=T-fp16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-64-nested=T-bf16-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-64-nested=F-fp32-cuda] PASSED [ 52%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-64-nested=F-fp16-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-64-nested=F-bf16-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization_large[blocksize=256-fp32-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization_large[blocksize=256-fp16-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization_large[blocksize=256-bf16-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_blockwise_cpu_large[4096-128] SKIPPED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_blockwise_cpu_large[16384-128] SKIPPED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[linear-bits=2-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[linear-bits=3-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[linear-bits=4-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[linear-bits=5-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[linear-bits=6-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[linear-bits=7-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[linear-bits=8-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[fp8-bits=2-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[fp8-bits=3-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[fp8-bits=4-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[fp8-bits=5-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[fp8-bits=6-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[fp8-bits=7-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[fp8-bits=8-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[dynamic-bits=2-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[dynamic-bits=3-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[dynamic-bits=4-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[dynamic-bits=5-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[dynamic-bits=6-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[dynamic-bits=7-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_few_bit_quant[dynamic-bits=8-cuda] PASSED [ 53%]
tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_fp8_quant[cuda] PASSED [ 53%]
tests/test_functional.py::test_stable_embedding PASSED [ 53%]
tests/test_functional.py::TestIGEMMFunctional::test_approx_igemm[batched=T-linear-dim2=16384-dim1=2048] PASSED [ 53%]
tests/test_functional.py::TestIGEMMFunctional::test_approx_igemm[batched=T-vectorwise-dim2=16384-dim1=2048] PASSED [ 53%]
tests/test_functional.py::TestIGEMMFunctional::test_approx_igemm[batched=F-linear-dim2=16384-dim1=2048] PASSED [ 53%]
tests/test_functional.py::TestIGEMMFunctional::test_approx_igemm[batched=F-vectorwise-dim2=16384-dim1=2048] PASSED [ 53%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TT-seq_dim=16-batch_dim=16-hidden_dim=32] PASSED [ 53%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TT-seq_dim=16-batch_dim=16-hidden_dim=256] PASSED [ 53%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TT-seq_dim=16-batch_dim=256-hidden_dim=32] PASSED [ 53%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TT-seq_dim=16-batch_dim=256-hidden_dim=256] PASSED [ 53%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TT-seq_dim=256-batch_dim=16-hidden_dim=32] PASSED [ 53%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TT-seq_dim=256-batch_dim=16-hidden_dim=256] PASSED [ 53%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TT-seq_dim=256-batch_dim=256-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TT-seq_dim=256-batch_dim=256-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TF-seq_dim=16-batch_dim=16-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TF-seq_dim=16-batch_dim=16-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TF-seq_dim=16-batch_dim=256-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TF-seq_dim=16-batch_dim=256-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TF-seq_dim=256-batch_dim=16-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TF-seq_dim=256-batch_dim=16-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TF-seq_dim=256-batch_dim=256-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=TF-seq_dim=256-batch_dim=256-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FT-seq_dim=16-batch_dim=16-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FT-seq_dim=16-batch_dim=16-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FT-seq_dim=16-batch_dim=256-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FT-seq_dim=16-batch_dim=256-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FT-seq_dim=256-batch_dim=16-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FT-seq_dim=256-batch_dim=16-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FT-seq_dim=256-batch_dim=256-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FT-seq_dim=256-batch_dim=256-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FF-seq_dim=16-batch_dim=16-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FF-seq_dim=16-batch_dim=16-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FF-seq_dim=16-batch_dim=256-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FF-seq_dim=16-batch_dim=256-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FF-seq_dim=256-batch_dim=16-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FF-seq_dim=256-batch_dim=16-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FF-seq_dim=256-batch_dim=256-hidden_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_igemm[transpose=FF-seq_dim=256-batch_dim=256-hidden_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=2-hidden_dim=64-seq_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=2-hidden_dim=64-seq_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=2-hidden_dim=64-seq_dim=512] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=2-hidden_dim=1024-seq_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=2-hidden_dim=1024-seq_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=2-hidden_dim=1024-seq_dim=512] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=2-hidden_dim=4096-seq_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=2-hidden_dim=4096-seq_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=2-hidden_dim=4096-seq_dim=512] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=8-hidden_dim=64-seq_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=8-hidden_dim=64-seq_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=8-hidden_dim=64-seq_dim=512] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=8-hidden_dim=1024-seq_dim=32] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=8-hidden_dim=1024-seq_dim=256] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=8-hidden_dim=1024-seq_dim=512] PASSED [ 54%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=8-hidden_dim=4096-seq_dim=32] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=8-hidden_dim=4096-seq_dim=256] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=8-hidden_dim=4096-seq_dim=512] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=16-hidden_dim=64-seq_dim=32] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=16-hidden_dim=64-seq_dim=256] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=16-hidden_dim=64-seq_dim=512] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=16-hidden_dim=1024-seq_dim=32] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=16-hidden_dim=1024-seq_dim=256] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=16-hidden_dim=1024-seq_dim=512] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=16-hidden_dim=4096-seq_dim=32] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=16-hidden_dim=4096-seq_dim=256] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_dim3_igemm[batch_dim=16-hidden_dim=4096-seq_dim=512] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=T-batch_dim=2-hidden_dim=32-seq_dim=32] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=T-batch_dim=2-hidden_dim=32-seq_dim=512] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=T-batch_dim=2-hidden_dim=4096-seq_dim=32] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=T-batch_dim=2-hidden_dim=4096-seq_dim=512] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=T-batch_dim=16-hidden_dim=32-seq_dim=32] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=T-batch_dim=16-hidden_dim=32-seq_dim=512] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=T-batch_dim=16-hidden_dim=4096-seq_dim=32] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=T-batch_dim=16-hidden_dim=4096-seq_dim=512] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=F-batch_dim=2-hidden_dim=32-seq_dim=32] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=F-batch_dim=2-hidden_dim=32-seq_dim=512] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=F-batch_dim=2-hidden_dim=4096-seq_dim=32] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=F-batch_dim=2-hidden_dim=4096-seq_dim=512] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=F-batch_dim=16-hidden_dim=32-seq_dim=32] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=F-batch_dim=16-hidden_dim=32-seq_dim=512] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=F-batch_dim=16-hidden_dim=4096-seq_dim=32] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_minmax_igemm[transpose=F-batch_dim=16-hidden_dim=4096-seq_dim=512] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=32-dim3=32-dim2=32-dim1=1] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=32-dim3=32-dim2=32-dim1=64] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=32-dim3=32-dim2=128-dim1=1] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=32-dim3=32-dim2=128-dim1=64] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=32-dim3=256-dim2=32-dim1=1] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=32-dim3=256-dim2=32-dim1=64] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=32-dim3=256-dim2=128-dim1=1] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=32-dim3=256-dim2=128-dim1=64] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=256-dim3=32-dim2=32-dim1=1] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=256-dim3=32-dim2=32-dim1=64] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=256-dim3=32-dim2=128-dim1=1] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=256-dim3=32-dim2=128-dim1=64] PASSED [ 55%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=256-dim3=256-dim2=32-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=256-dim3=256-dim2=32-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=256-dim3=256-dim2=128-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TT-dim4=256-dim3=256-dim2=128-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=32-dim3=32-dim2=32-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=32-dim3=32-dim2=32-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=32-dim3=32-dim2=128-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=32-dim3=32-dim2=128-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=32-dim3=256-dim2=32-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=32-dim3=256-dim2=32-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=32-dim3=256-dim2=128-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=32-dim3=256-dim2=128-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=256-dim3=32-dim2=32-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=256-dim3=32-dim2=32-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=256-dim3=32-dim2=128-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=256-dim3=32-dim2=128-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=256-dim3=256-dim2=32-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=256-dim3=256-dim2=32-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=256-dim3=256-dim2=128-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=TF-dim4=256-dim3=256-dim2=128-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=32-dim3=32-dim2=32-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=32-dim3=32-dim2=32-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=32-dim3=32-dim2=128-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=32-dim3=32-dim2=128-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=32-dim3=256-dim2=32-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=32-dim3=256-dim2=32-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=32-dim3=256-dim2=128-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=32-dim3=256-dim2=128-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=256-dim3=32-dim2=32-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=256-dim3=32-dim2=32-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=256-dim3=32-dim2=128-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=256-dim3=32-dim2=128-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=256-dim3=256-dim2=32-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=256-dim3=256-dim2=32-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=256-dim3=256-dim2=128-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FT-dim4=256-dim3=256-dim2=128-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=32-dim3=32-dim2=32-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=32-dim3=32-dim2=32-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=32-dim3=32-dim2=128-dim1=1] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=32-dim3=32-dim2=128-dim1=64] PASSED [ 56%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=32-dim3=256-dim2=32-dim1=1] PASSED [ 57%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=32-dim3=256-dim2=32-dim1=64] PASSED [ 57%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=32-dim3=256-dim2=128-dim1=1] PASSED [ 57%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=32-dim3=256-dim2=128-dim1=64] PASSED [ 57%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=256-dim3=32-dim2=32-dim1=1] PASSED [ 57%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=256-dim3=32-dim2=32-dim1=64] PASSED [ 57%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=256-dim3=32-dim2=128-dim1=1] PASSED [ 57%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=256-dim3=32-dim2=128-dim1=64] PASSED [ 57%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=256-dim3=256-dim2=32-dim1=1] PASSED [ 57%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=256-dim3=256-dim2=32-dim1=64] PASSED [ 57%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=256-dim3=256-dim2=128-dim1=1] PASSED [ 57%]
tests/test_functional.py::TestIGEMMFunctional::test_ibmm[transpose=FF-dim4=256-dim3=256-dim2=128-dim1=64] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_int8_linear_matmul[ldb=0-dims=2-dim4=512-dim3=499-dim2=256-dim1=128-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_int8_linear_matmul[ldb=0-dims=2-dim4=512-dim3=512-dim2=256-dim1=128-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_int8_linear_matmul[ldb=0-dims=3-dim4=512-dim3=499-dim2=256-dim1=128-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_int8_linear_matmul[ldb=0-dims=3-dim4=512-dim3=512-dim2=256-dim1=128-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_int8_linear_matmul_half[dims=2-dim4=32-dim3=32-dim2=32-dim1=32-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_dequant_mm[has_bias=T-dims=2-dim4=64-dim1=64-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_dequant_mm[has_bias=T-dims=2-dim4=64-dim1=256-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_dequant_mm[has_bias=T-dims=2-dim4=1024-dim1=64-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_dequant_mm[has_bias=T-dims=2-dim4=1024-dim1=256-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_dequant_mm[has_bias=F-dims=2-dim4=64-dim1=64-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_dequant_mm[has_bias=F-dims=2-dim4=64-dim1=256-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_dequant_mm[has_bias=F-dims=2-dim4=1024-dim1=64-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_dequant_mm[has_bias=F-dims=2-dim4=1024-dim1=256-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_integrated_int8_linear_matmul[dim1=1,dim4=2,inner=4-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_integrated_int8_linear_matmul[dim1=8,dim4=128,inner=256-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_integrated_int8_linear_matmul[dim1=2048,dim4=2048,inner=512-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_integrated_int8_linear_matmul[dim1=4096,dim4=4096,inner=4096-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_coo_double_quant[dim2=1024-dim1=512-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_coo_double_quant[dim2=1024-dim1=2048-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_coo_double_quant[dim2=4096-dim1=512-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_coo_double_quant[dim2=4096-dim1=2048-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_coo_int8_vectorwise_quant[dim2=1024-dim1=512-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_coo_int8_vectorwise_quant[dim2=1024-dim1=2048-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_coo_int8_vectorwise_quant[dim2=4096-dim1=512-cuda] PASSED [ 57%]
tests/test_functional.py::TestLLMInt8Functional::test_coo_int8_vectorwise_quant[dim2=4096-dim1=2048-cuda] PASSED [ 57%]
tests/test_functional.py::TestSpMMFunctional::test_spmm_coo[transposed_B=T-dim2=128-dim1=256] SKIPPED [ 57%]
tests/test_functional.py::TestSpMMFunctional::test_spmm_coo[transposed_B=T-dim2=128-dim1=1024] SKIPPED [ 57%]
tests/test_functional.py::TestSpMMFunctional::test_spmm_coo[transposed_B=T-dim2=512-dim1=256] SKIPPED [ 57%]
tests/test_functional.py::TestSpMMFunctional::test_spmm_coo[transposed_B=T-dim2=512-dim1=1024] SKIPPED [ 58%]
tests/test_functional.py::TestSpMMFunctional::test_spmm_coo[transposed_B=F-dim2=128-dim1=256] SKIPPED [ 58%]
tests/test_functional.py::TestSpMMFunctional::test_spmm_coo[transposed_B=F-dim2=128-dim1=1024] SKIPPED [ 58%]
tests/test_functional.py::TestSpMMFunctional::test_spmm_coo[transposed_B=F-dim2=512-dim1=256] SKIPPED [ 58%]
tests/test_functional.py::TestSpMMFunctional::test_spmm_coo[transposed_B=F-dim2=512-dim1=1024] SKIPPED [ 58%]
tests/test_functional.py::TestSpMMFunctional::test_spmm_coo_very_sparse[out_func=zeros-fp16-dim2=12288-dim1=2048] SKIPPED [ 58%]
tests/test_functional.py::TestSpMMFunctional::test_spmm_coo_very_sparse[out_func=ones-fp16-dim2=12288-dim1=2048] SKIPPED [ 58%]
tests/test_functional.py::TestSpMMFunctional::test_spmm_coo_dequant[dtype0-2048-2048] SKIPPED [ 58%]
tests/test_functional.py::TestSparseTensorFunctional::test_coo2csr PASSED [ 58%]
tests/test_functional.py::TestSparseTensorFunctional::test_coo2csc PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[64-fp4-fp32-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[64-fp4-fp16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[64-fp4-bf16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[64-nf4-fp32-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[64-nf4-fp16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[64-nf4-bf16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[128-fp4-fp32-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[128-fp4-fp16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[128-fp4-bf16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[128-nf4-fp32-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[128-nf4-fp16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[128-nf4-bf16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[256-fp4-fp32-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[256-fp4-fp16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[256-fp4-bf16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[256-nf4-fp32-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[256-nf4-fp16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[256-nf4-bf16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[512-fp4-fp32-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[512-fp4-fp16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[512-fp4-bf16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[512-nf4-fp32-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[512-nf4-fp16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[512-nf4-bf16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[1024-fp4-fp32-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[1024-fp4-fp16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[1024-fp4-bf16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[1024-nf4-fp32-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[1024-nf4-fp16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[1024-nf4-bf16-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[2048-fp4-fp32-cuda] PASSED [ 58%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[2048-fp4-fp16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[2048-fp4-bf16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[2048-nf4-fp32-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[2048-nf4-fp16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[2048-nf4-bf16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[4096-fp4-fp32-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[4096-fp4-fp16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[4096-fp4-bf16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[4096-nf4-fp32-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[4096-nf4-fp16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[4096-nf4-bf16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_compressed_stats[fp32-blocksize=64-fp4-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_compressed_stats[fp32-blocksize=64-nf4-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_compressed_stats[fp32-blocksize=128-fp4-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_compressed_stats[fp32-blocksize=128-nf4-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_compressed_stats[fp16-blocksize=64-fp4-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_compressed_stats[fp16-blocksize=64-nf4-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_compressed_stats[fp16-blocksize=128-fp4-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_compressed_stats[fp16-blocksize=128-nf4-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant_large[blocksize=64-fp4-fp32-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant_large[blocksize=64-fp4-fp16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant_large[blocksize=64-fp4-bf16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant_large[blocksize=64-nf4-fp32-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant_large[blocksize=64-nf4-fp16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant_large[blocksize=64-nf4-bf16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant_large[blocksize=128-fp4-fp32-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant_large[blocksize=128-fp4-fp16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant_large[blocksize=128-fp4-bf16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant_large[blocksize=128-nf4-fp32-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant_large[blocksize=128-nf4-fp16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant_large[blocksize=128-nf4-bf16-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-attn-nf4-DQ_True-cuda] PASSED [ 59%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-attn-nf4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-attn-fp4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-attn-fp4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-attn-nf4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-attn-nf4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-attn-fp4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-attn-fp4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-fc1-nf4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-fc1-nf4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-fc1-fp4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-fc1-fp4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-attn-nf4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-attn-nf4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-attn-fp4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-attn-fp4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-attn_packed-nf4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-attn_packed-nf4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-attn_packed-fp4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-uint8-fp32-attn_packed-fp4-DQ_False-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 60%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-attn-nf4-DQ_True-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-attn-nf4-DQ_False-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-attn-fp4-DQ_True-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-attn-fp4-DQ_False-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 61%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-attn-nf4-DQ_True-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-attn-nf4-DQ_False-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-attn-fp4-DQ_True-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-attn-fp4-DQ_False-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-fc1-nf4-DQ_True-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-fc1-nf4-DQ_False-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-fc1-fp4-DQ_True-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-fc1-fp4-DQ_False-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-attn-nf4-DQ_True-cuda] PASSED [ 61%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-attn-nf4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-attn-fp4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-attn-fp4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-attn_packed-nf4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-attn_packed-nf4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-attn_packed-fp4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp16-fp32-attn_packed-fp4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 62%] 
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-attn-nf4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-attn-nf4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-attn-fp4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-attn-fp4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-attn-nf4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-attn-nf4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-attn-fp4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-attn-fp4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 62%] 
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-fc1-nf4-DQ_True-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-fc1-nf4-DQ_False-cuda] PASSED [ 62%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-fc1-fp4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-fc1-fp4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-attn-nf4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-attn-nf4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-attn-fp4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-attn-fp4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-attn_packed-nf4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-attn_packed-nf4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-attn_packed-fp4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-bf16-fp32-attn_packed-fp4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-attn-nf4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-attn-nf4-DQ_False-cuda] PASSED [ 63%] 
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-attn-fp4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-attn-fp4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-attn-nf4-DQ_True-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-attn-nf4-DQ_False-cuda] PASSED [ 63%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-attn-fp4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-attn-fp4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-fc1-nf4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-fc1-nf4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-fc1-fp4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-fc1-fp4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 64%] 
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-attn-nf4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-attn-nf4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-attn-fp4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-attn-fp4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-attn_packed-nf4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-attn_packed-nf4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-attn_packed-fp4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=128-fp32-fp32-attn_packed-fp4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-attn-nf4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-attn-nf4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-attn-fp4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-attn-fp4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 64%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 64%] 
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-attn-nf4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-attn-nf4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-attn-fp4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-attn-fp4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-fc1-nf4-DQ_True-cuda] FAILED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-fc1-nf4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-fc1-fp4-DQ_True-cuda] FAILED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-fc1-fp4-DQ_False-cuda] FAILED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-attn-nf4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-attn-nf4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-attn-fp4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-attn-fp4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-attn_packed-nf4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-attn_packed-nf4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-attn_packed-fp4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-fp32-attn_packed-fp4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-attn-nf4-DQ_True-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-attn-nf4-DQ_False-cuda] PASSED [ 65%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-attn-fp4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-attn-fp4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-attn-nf4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-attn-nf4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-attn-fp4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-attn-fp4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-fc1-nf4-DQ_True-cuda] FAILED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-fc1-nf4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-fc1-fp4-DQ_True-cuda] FAILED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-fc1-fp4-DQ_False-cuda] FAILED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-attn-nf4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-attn-nf4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-attn-fp4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-attn-fp4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-attn_packed-nf4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-attn_packed-nf4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-attn_packed-fp4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp16-fp32-attn_packed-fp4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 66%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-attn-nf4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-attn-nf4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-attn-fp4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-attn-fp4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-attn-nf4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-attn-nf4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-attn-fp4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-attn-fp4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-fc1-nf4-DQ_True-cuda] FAILED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-fc1-nf4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-fc1-fp4-DQ_True-cuda] FAILED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-fc1-fp4-DQ_False-cuda] FAILED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-attn-nf4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-attn-nf4-DQ_False-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-attn-fp4-DQ_True-cuda] PASSED [ 67%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-attn-fp4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-attn_packed-nf4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-attn_packed-nf4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-attn_packed-fp4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-bf16-fp32-attn_packed-fp4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-attn-nf4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-attn-nf4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-attn-fp4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-attn-fp4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-attn-nf4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-attn-nf4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-attn-fp4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-attn-fp4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-fc1-nf4-DQ_True-cuda] FAILED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-fc1-nf4-DQ_False-cuda] PASSED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-fc1-fp4-DQ_True-cuda] FAILED [ 68%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-fc1-fp4-DQ_False-cuda] FAILED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-attn-nf4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-attn-nf4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-attn-fp4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-attn-fp4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-attn_packed-nf4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-attn_packed-nf4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-attn_packed-fp4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-fp32-fp32-attn_packed-fp4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-attn-nf4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-attn-nf4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-attn-fp4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-attn-fp4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-attn-nf4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-attn-nf4-DQ_False-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-attn-fp4-DQ_True-cuda] PASSED [ 69%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-attn-fp4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-fc1-nf4-DQ_True-cuda] FAILED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-fc1-nf4-DQ_False-cuda] FAILED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-fc1-fp4-DQ_True-cuda] FAILED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-fc1-fp4-DQ_False-cuda] FAILED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-attn-nf4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-attn-nf4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-attn-fp4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-attn-fp4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-attn_packed-nf4-DQ_True-cuda] FAILED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-attn_packed-nf4-DQ_False-cuda] FAILED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-attn_packed-fp4-DQ_True-cuda] FAILED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-uint8-fp32-attn_packed-fp4-DQ_False-cuda] FAILED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-attn-nf4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-attn-nf4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-attn-fp4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-attn-fp4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 70%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-attn-nf4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-attn-nf4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-attn-fp4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-attn-fp4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-fc1-nf4-DQ_True-cuda] FAILED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-fc1-nf4-DQ_False-cuda] FAILED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-fc1-fp4-DQ_True-cuda] FAILED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-fc1-fp4-DQ_False-cuda] FAILED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-attn-nf4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-attn-nf4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-attn-fp4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-attn-fp4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-attn_packed-nf4-DQ_True-cuda] FAILED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-attn_packed-nf4-DQ_False-cuda] FAILED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-attn_packed-fp4-DQ_True-cuda] FAILED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp16-fp32-attn_packed-fp4-DQ_False-cuda] FAILED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-attn-nf4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-attn-nf4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-attn-fp4-DQ_True-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-attn-fp4-DQ_False-cuda] PASSED [ 71%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-attn-nf4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-attn-nf4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-attn-fp4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-attn-fp4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-fc1-nf4-DQ_True-cuda] FAILED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-fc1-nf4-DQ_False-cuda] FAILED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-fc1-fp4-DQ_True-cuda] FAILED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-fc1-fp4-DQ_False-cuda] FAILED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-attn-nf4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-attn-nf4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-attn-fp4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-attn-fp4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-attn_packed-nf4-DQ_True-cuda] FAILED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-attn_packed-nf4-DQ_False-cuda] FAILED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-attn_packed-fp4-DQ_True-cuda] FAILED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-bf16-fp32-attn_packed-fp4-DQ_False-cuda] FAILED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 72%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-attn-nf4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-attn-nf4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-attn-fp4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-attn-fp4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-attn-nf4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-attn-nf4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-attn-fp4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-attn-fp4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-fc1-nf4-DQ_True-cuda] FAILED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-fc1-nf4-DQ_False-cuda] FAILED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-fc1-fp4-DQ_True-cuda] FAILED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-fc1-fp4-DQ_False-cuda] FAILED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-attn-nf4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-attn-nf4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-attn-fp4-DQ_True-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-attn-fp4-DQ_False-cuda] PASSED [ 73%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-attn_packed-nf4-DQ_True-cuda] FAILED [ 74%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-attn_packed-nf4-DQ_False-cuda] FAILED [ 74%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-attn_packed-fp4-DQ_True-cuda] FAILED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=512-fp32-fp32-attn_packed-fp4-DQ_False-cuda] FAILED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-attn-nf4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-attn-nf4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-attn-fp4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-attn-fp4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-attn-nf4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-attn-nf4-DQ_False-cuda] 
PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-attn-fp4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-attn-fp4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-fc1-nf4-DQ_True-cuda] FAILED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-fc1-nf4-DQ_False-cuda] FAILED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-fc1-fp4-DQ_True-cuda] FAILED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-fc1-fp4-DQ_False-cuda] FAILED [ 74%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-attn-nf4-DQ_True-cuda] FAILED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-attn-nf4-DQ_False-cuda] FAILED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-attn-fp4-DQ_True-cuda] FAILED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-attn-fp4-DQ_False-cuda] FAILED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-attn_packed-nf4-DQ_True-cuda] FAILED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-attn_packed-nf4-DQ_False-cuda] FAILED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-attn_packed-fp4-DQ_True-cuda] FAILED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-uint8-fp32-attn_packed-fp4-DQ_False-cuda] FAILED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 75%] 
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-attn-nf4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-attn-nf4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-attn-fp4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-attn-fp4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-attn-nf4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-attn-nf4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-attn-fp4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-attn-fp4-DQ_False-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 75%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-fc1-nf4-DQ_True-cuda] FAILED [ 76%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-fc1-nf4-DQ_False-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-fc1-fp4-DQ_True-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-fc1-fp4-DQ_False-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-attn-nf4-DQ_True-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-attn-nf4-DQ_False-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-attn-fp4-DQ_True-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-attn-fp4-DQ_False-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-attn_packed-nf4-DQ_True-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-attn_packed-nf4-DQ_False-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-attn_packed-fp4-DQ_True-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-attn_packed-fp4-DQ_False-cuda] FAILED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-attn-nf4-DQ_True-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-attn-nf4-DQ_False-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-attn-fp4-DQ_True-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-attn-fp4-DQ_False-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 76%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 76%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-attn-nf4-DQ_True-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-attn-nf4-DQ_False-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-attn-fp4-DQ_True-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-attn-fp4-DQ_False-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-fc1-nf4-DQ_True-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-fc1-nf4-DQ_False-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-fc1-fp4-DQ_True-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-fc1-fp4-DQ_False-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-attn-nf4-DQ_True-cuda] FAILED [ 77%] 
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-attn-nf4-DQ_False-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-attn-fp4-DQ_True-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-attn-fp4-DQ_False-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-attn_packed-nf4-DQ_True-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-attn_packed-nf4-DQ_False-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-attn_packed-fp4-DQ_True-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-bf16-fp32-attn_packed-fp4-DQ_False-cuda] FAILED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-fc1-nf4-DQ_True-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-fc1-nf4-DQ_False-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-fc1-fp4-DQ_True-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-fc1-fp4-DQ_False-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-fc2-nf4-DQ_True-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-fc2-nf4-DQ_False-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-fc2-fp4-DQ_True-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-fc2-fp4-DQ_False-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-attn-nf4-DQ_True-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-attn-nf4-DQ_False-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-attn-fp4-DQ_True-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-attn-fp4-DQ_False-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-attn_packed-nf4-DQ_True-cuda] PASSED [ 77%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-attn_packed-nf4-DQ_False-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-attn_packed-fp4-DQ_True-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp16-attn_packed-fp4-DQ_False-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-fc1-nf4-DQ_True-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-fc1-nf4-DQ_False-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-fc1-fp4-DQ_True-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-fc1-fp4-DQ_False-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-fc2-nf4-DQ_True-cuda] PASSED [ 78%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-fc2-nf4-DQ_False-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-fc2-fp4-DQ_True-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-fc2-fp4-DQ_False-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-attn-nf4-DQ_True-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-attn-nf4-DQ_False-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-attn-fp4-DQ_True-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-attn-fp4-DQ_False-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-attn_packed-nf4-DQ_True-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-attn_packed-nf4-DQ_False-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-attn_packed-fp4-DQ_True-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-bf16-attn_packed-fp4-DQ_False-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-fc1-nf4-DQ_True-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-fc1-nf4-DQ_False-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-fc1-fp4-DQ_True-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-fc1-fp4-DQ_False-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-fc2-nf4-DQ_True-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-fc2-nf4-DQ_False-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-fc2-fp4-DQ_True-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-fc2-fp4-DQ_False-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-attn-nf4-DQ_True-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-attn-nf4-DQ_False-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-attn-fp4-DQ_True-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-attn-fp4-DQ_False-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-attn_packed-nf4-DQ_True-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-attn_packed-nf4-DQ_False-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-attn_packed-fp4-DQ_True-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp32-fp32-attn_packed-fp4-DQ_False-cuda] FAILED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_eye_4bit[fp16-nf4-cuda] PASSED [ 78%]
tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_eye_4bit[fp16-fp4-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_eye_4bit[bf16-nf4-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_eye_4bit[bf16-fp4-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_eye_4bit[fp32-nf4-cuda] PASSED [ 78%] tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_eye_4bit[fp32-fp4-cuda] PASSED [ 79%] tests/test_functional.py::test_normal_map_tree PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=T-original_dtype0-uint8-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=T-original_dtype0-float16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=T-original_dtype0-bfloat16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=T-original_dtype0-float32-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=T-original_dtype1-uint8-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=T-original_dtype1-float16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=T-original_dtype1-bfloat16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=T-original_dtype1-float32-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=F-original_dtype0-uint8-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=F-original_dtype0-float16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=F-original_dtype0-bfloat16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=F-original_dtype0-float32-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=F-original_dtype1-uint8-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=F-original_dtype1-float16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=F-original_dtype1-bfloat16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=T-bias=F-original_dtype1-float32-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=T-original_dtype0-uint8-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=T-original_dtype0-float16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=T-original_dtype0-bfloat16-cuda] PASSED [ 79%] 
tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=T-original_dtype0-float32-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=T-original_dtype1-uint8-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=T-original_dtype1-float16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=T-original_dtype1-bfloat16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=T-original_dtype1-float32-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=F-original_dtype0-uint8-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=F-original_dtype0-float16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=F-original_dtype0-bfloat16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=F-original_dtype0-float32-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=F-original_dtype1-uint8-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=F-original_dtype1-float16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=F-original_dtype1-bfloat16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-nf4-compress_statistics=F-bias=F-original_dtype1-float32-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=T-original_dtype0-uint8-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=T-original_dtype0-float16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=T-original_dtype0-bfloat16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=T-original_dtype0-float32-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=T-original_dtype1-uint8-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=T-original_dtype1-float16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=T-original_dtype1-bfloat16-cuda] PASSED [ 79%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=T-original_dtype1-float32-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=F-original_dtype0-uint8-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=F-original_dtype0-float16-cuda] PASSED [ 80%] 
tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=F-original_dtype0-bfloat16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=F-original_dtype0-float32-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=F-original_dtype1-uint8-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=F-original_dtype1-float16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=F-original_dtype1-bfloat16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=T-bias=F-original_dtype1-float32-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=T-original_dtype0-uint8-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=T-original_dtype0-float16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=T-original_dtype0-bfloat16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=T-original_dtype0-float32-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=T-original_dtype1-uint8-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=T-original_dtype1-float16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=T-original_dtype1-bfloat16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=T-original_dtype1-float32-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=F-original_dtype0-uint8-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=F-original_dtype0-float16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=F-original_dtype0-bfloat16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=F-original_dtype0-float32-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=F-original_dtype1-uint8-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=F-original_dtype1-float16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=F-original_dtype1-bfloat16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=T-fp4-compress_statistics=F-bias=F-original_dtype1-float32-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=T-original_dtype0-uint8-cuda] PASSED [ 80%] 
tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=T-original_dtype0-float16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=T-original_dtype0-bfloat16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=T-original_dtype0-float32-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=T-original_dtype1-uint8-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=T-original_dtype1-float16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=T-original_dtype1-bfloat16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=T-original_dtype1-float32-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=F-original_dtype0-uint8-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=F-original_dtype0-float16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=F-original_dtype0-bfloat16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=F-original_dtype0-float32-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=F-original_dtype1-uint8-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=F-original_dtype1-float16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=F-original_dtype1-bfloat16-cuda] PASSED [ 80%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=T-bias=F-original_dtype1-float32-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=T-original_dtype0-uint8-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=T-original_dtype0-float16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=T-original_dtype0-bfloat16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=T-original_dtype0-float32-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=T-original_dtype1-uint8-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=T-original_dtype1-float16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=T-original_dtype1-bfloat16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=T-original_dtype1-float32-cuda] PASSED [ 81%] 
tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=F-original_dtype0-uint8-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=F-original_dtype0-float16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=F-original_dtype0-bfloat16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=F-original_dtype0-float32-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=F-original_dtype1-uint8-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=F-original_dtype1-float16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=F-original_dtype1-bfloat16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-nf4-compress_statistics=F-bias=F-original_dtype1-float32-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=T-original_dtype0-uint8-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=T-original_dtype0-float16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=T-original_dtype0-bfloat16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=T-original_dtype0-float32-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=T-original_dtype1-uint8-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=T-original_dtype1-float16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=T-original_dtype1-bfloat16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=T-original_dtype1-float32-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=F-original_dtype0-uint8-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=F-original_dtype0-float16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=F-original_dtype0-bfloat16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=F-original_dtype0-float32-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=F-original_dtype1-uint8-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=F-original_dtype1-float16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=F-original_dtype1-bfloat16-cuda] PASSED [ 81%] 
tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=T-bias=F-original_dtype1-float32-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=T-original_dtype0-uint8-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=T-original_dtype0-float16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=T-original_dtype0-bfloat16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=T-original_dtype0-float32-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=T-original_dtype1-uint8-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=T-original_dtype1-float16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=T-original_dtype1-bfloat16-cuda] PASSED [ 81%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=T-original_dtype1-float32-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=F-original_dtype0-uint8-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=F-original_dtype0-float16-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=F-original_dtype0-bfloat16-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=F-original_dtype0-float32-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=F-original_dtype1-uint8-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=F-original_dtype1-float16-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=F-original_dtype1-bfloat16-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_linear_serialization[save_before_forward=F-fp4-compress_statistics=F-bias=F-original_dtype1-float32-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_copy_param[compress_statistics=T-64-nf4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_copy_param[compress_statistics=T-64-fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_copy_param[compress_statistics=T-128-nf4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_copy_param[compress_statistics=T-128-fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_copy_param[compress_statistics=F-64-nf4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_copy_param[compress_statistics=F-64-fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_copy_param[compress_statistics=F-128-nf4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_copy_param[compress_statistics=F-128-fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_params4bit_torch_chunk_split[nf4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_params4bit_torch_chunk_split[fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_deepcopy_param[compress_statistics=T-64-nf4-cuda] PASSED [ 82%]
tests/test_linear4bit.py::test_deepcopy_param[compress_statistics=T-64-fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_deepcopy_param[compress_statistics=T-128-nf4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_deepcopy_param[compress_statistics=T-128-fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_deepcopy_param[compress_statistics=F-64-nf4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_deepcopy_param[compress_statistics=F-64-fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_deepcopy_param[compress_statistics=F-128-nf4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_deepcopy_param[compress_statistics=F-128-fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_params4bit_real_serialization[compress_statistics=T-64-nf4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_params4bit_real_serialization[compress_statistics=T-64-fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_params4bit_real_serialization[compress_statistics=T-128-nf4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_params4bit_real_serialization[compress_statistics=T-128-fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_params4bit_real_serialization[compress_statistics=F-64-nf4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_params4bit_real_serialization[compress_statistics=F-64-fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_params4bit_real_serialization[compress_statistics=F-128-nf4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_params4bit_real_serialization[compress_statistics=F-128-fp4-cuda] PASSED [ 82%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=T-compress_statistics=T-bf16-nf4-cuda] SKIPPED [ 82%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=T-compress_statistics=T-bf16-fp4-cuda] SKIPPED [ 82%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=T-compress_statistics=T-fp32-nf4-cuda] SKIPPED [ 82%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=T-compress_statistics=T-fp32-fp4-cuda] SKIPPED [ 82%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=T-compress_statistics=F-bf16-nf4-cuda] SKIPPED [ 82%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=T-compress_statistics=F-bf16-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=T-compress_statistics=F-fp32-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=T-compress_statistics=F-fp32-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=F-compress_statistics=T-bf16-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=F-compress_statistics=T-bf16-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=F-compress_statistics=T-fp32-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=F-compress_statistics=T-fp32-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=F-compress_statistics=F-bf16-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=F-compress_statistics=F-bf16-fp4-cuda] SKIPPED [ 83%]
tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=F-compress_statistics=F-fp32-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=T-bias=F-compress_statistics=F-fp32-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=T-compress_statistics=T-bf16-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=T-compress_statistics=T-bf16-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=T-compress_statistics=T-fp32-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=T-compress_statistics=T-fp32-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=T-compress_statistics=F-bf16-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=T-compress_statistics=F-bf16-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=T-compress_statistics=F-fp32-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=T-compress_statistics=F-fp32-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=F-compress_statistics=T-bf16-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=F-compress_statistics=T-bf16-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=F-compress_statistics=T-fp32-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=F-compress_statistics=T-fp32-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=F-compress_statistics=F-bf16-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=F-compress_statistics=F-bf16-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=F-compress_statistics=F-fp32-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=default-fullgraph=F-bias=F-compress_statistics=F-fp32-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=T-compress_statistics=T-bf16-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=T-compress_statistics=T-bf16-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=T-compress_statistics=T-fp32-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=T-compress_statistics=T-fp32-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=T-compress_statistics=F-bf16-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=T-compress_statistics=F-bf16-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=T-compress_statistics=F-fp32-nf4-cuda] SKIPPED [ 83%]
tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=T-compress_statistics=F-fp32-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=F-compress_statistics=T-bf16-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=F-compress_statistics=T-bf16-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=F-compress_statistics=T-fp32-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=F-compress_statistics=T-fp32-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=F-compress_statistics=F-bf16-nf4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=F-compress_statistics=F-bf16-fp4-cuda] SKIPPED [ 83%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=F-compress_statistics=F-fp32-nf4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=T-bias=F-compress_statistics=F-fp32-fp4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=T-compress_statistics=T-bf16-nf4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=T-compress_statistics=T-bf16-fp4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=T-compress_statistics=T-fp32-nf4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=T-compress_statistics=T-fp32-fp4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=T-compress_statistics=F-bf16-nf4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=T-compress_statistics=F-bf16-fp4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=T-compress_statistics=F-fp32-nf4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=T-compress_statistics=F-fp32-fp4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=F-compress_statistics=T-bf16-nf4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=F-compress_statistics=T-bf16-fp4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=F-compress_statistics=T-fp32-nf4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=F-compress_statistics=T-fp32-fp4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=F-compress_statistics=F-bf16-nf4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=F-compress_statistics=F-bf16-fp4-cuda] SKIPPED [ 84%] tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=F-compress_statistics=F-fp32-nf4-cuda] SKIPPED [ 84%]
tests/test_linear4bit.py::test_linear4bit_torch_compile[mode=reduce-overhead-fullgraph=F-bias=F-compress_statistics=F-fp32-fp4-cuda] SKIPPED [ 84%] tests/test_linear8bitlt.py::test_linear_no_igemmlt[cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=T-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=T-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=T-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=T-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=F-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=F-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=F-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=F-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=T-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=T-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=T-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=T-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=F-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=F-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=F-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=F-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 84%]
tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=T-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=T-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=T-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=T-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=F-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 84%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=F-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=F-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=F-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=T-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=T-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=T-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=T-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=F-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=F-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=F-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=T-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=F-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=T-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 85%] 
tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=T-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=T-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=T-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=F-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=F-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=F-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=T-serialize_before_forward=F-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=T-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=T-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=T-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=T-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=F-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=F-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=F-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=T-deserialize_before_cuda=F-serialize_before_forward=F-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=T-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=T-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 85%] 
tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=T-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=T-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=F-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=F-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=F-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=T-serialize_before_forward=F-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=T-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=T-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=T-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=T-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=F-threshold=0.0-has_fp16_weights=T-cuda] PASSED [ 85%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=F-threshold=0.0-has_fp16_weights=F-cuda] PASSED [ 86%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=F-threshold=6.0-has_fp16_weights=T-cuda] PASSED [ 86%] tests/test_linear8bitlt.py::test_linear_serialization[load_before_cuda=F-save_before_forward=F-deserialize_before_cuda=F-serialize_before_forward=F-threshold=6.0-has_fp16_weights=F-cuda] PASSED [ 86%] tests/test_linear8bitlt.py::test_linear8bit_copy_param PASSED [ 86%] tests/test_linear8bitlt.py::test_linear8bit_deepcopy_param PASSED [ 86%] tests/test_linear8bitlt.py::test_linear8bit_serialization PASSED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=default-fullgraph=T-bias=T-threshold=0.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=default-fullgraph=T-bias=T-threshold=6.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=default-fullgraph=T-bias=F-threshold=0.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=default-fullgraph=T-bias=F-threshold=6.0-cuda] SKIPPED [ 86%] 
tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=default-fullgraph=F-bias=T-threshold=0.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=default-fullgraph=F-bias=T-threshold=6.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=default-fullgraph=F-bias=F-threshold=0.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=default-fullgraph=F-bias=F-threshold=6.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=reduce-overhead-fullgraph=T-bias=T-threshold=0.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=reduce-overhead-fullgraph=T-bias=T-threshold=6.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=reduce-overhead-fullgraph=T-bias=F-threshold=0.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=reduce-overhead-fullgraph=T-bias=F-threshold=6.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=reduce-overhead-fullgraph=F-bias=T-threshold=0.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=reduce-overhead-fullgraph=F-bias=T-threshold=6.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=reduce-overhead-fullgraph=F-bias=F-threshold=0.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_torch_compile[mode=reduce-overhead-fullgraph=F-bias=F-threshold=6.0-cuda] SKIPPED [ 86%] tests/test_linear8bitlt.py::test_linear8bitlt_device_movement[cuda] PASSED [ 86%] tests/test_modules.py::test_linear8bitlt_inference[threshold=0.0-cuda] PASSED [ 86%] tests/test_modules.py::test_linear8bitlt_inference[threshold=3.0-cuda] PASSED [ 86%] tests/test_modules.py::test_linear8bitlt_accumulated_gradient[cuda] PASSED [ 86%] tests/test_modules.py::test_linear8bitlt_no_fp16_weights[0.0-cuda] PASSED [ 86%] tests/test_modules.py::test_linear8bitlt_no_fp16_weights[2.0-cuda] PASSED [ 86%] tests/test_modules.py::test_linear_kbit_fp32_bias[Int8Lt-cuda] PASSED [ 86%] tests/test_modules.py::test_linear_kbit_fp32_bias[NF4-cuda] PASSED [ 86%] tests/test_modules.py::test_kbit_backprop[dtype0-Int8Lt-cuda] PASSED [ 86%] tests/test_modules.py::test_kbit_backprop[dtype0-4bit-cuda] PASSED [ 86%] tests/test_modules.py::test_kbit_backprop[dtype0-FP4-cuda] PASSED [ 86%] tests/test_modules.py::test_kbit_backprop[dtype0-NF4-cuda] PASSED [ 86%] tests/test_modules.py::test_kbit_backprop[dtype0-FP4+C-cuda] PASSED [ 86%] tests/test_modules.py::test_kbit_backprop[dtype0-NF4+C-cuda] PASSED [ 86%] tests/test_modules.py::test_kbit_backprop[dtype0-NF4+fp32-cuda] PASSED [ 86%] tests/test_modules.py::test_kbit_backprop[dtype0-NF4+fp16-cuda] PASSED [ 86%] tests/test_modules.py::test_kbit_backprop[dtype0-NF4+bf16-cuda] PASSED [ 86%] tests/test_modules.py::test_kbit_backprop[dtype1-Int8Lt-cuda] PASSED [ 86%] tests/test_modules.py::test_kbit_backprop[dtype1-4bit-cuda] PASSED [ 87%] tests/test_modules.py::test_kbit_backprop[dtype1-FP4-cuda] PASSED [ 87%] tests/test_modules.py::test_kbit_backprop[dtype1-NF4-cuda] PASSED [ 87%] tests/test_modules.py::test_kbit_backprop[dtype1-FP4+C-cuda] PASSED [ 87%] tests/test_modules.py::test_kbit_backprop[dtype1-NF4+C-cuda] PASSED [ 87%] tests/test_modules.py::test_kbit_backprop[dtype1-NF4+fp32-cuda] PASSED [ 87%] tests/test_modules.py::test_kbit_backprop[dtype1-NF4+fp16-cuda] PASSED [ 87%] 
tests/test_modules.py::test_kbit_backprop[dtype1-NF4+bf16-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[Embedding8bit-None-(10,)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[Embedding8bit-None-(10,)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[Embedding8bit-None-(10, 10)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[Embedding8bit-None-(10, 10)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[Embedding8bit-None-(10, 10, 10)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[Embedding8bit-None-(10, 10, 10)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingFP4-torch.uint8-(10,)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingFP4-torch.uint8-(10,)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingFP4-torch.uint8-(10, 10)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingFP4-torch.uint8-(10, 10)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingFP4-torch.uint8-(10, 10, 10)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingFP4-torch.uint8-(10, 10, 10)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingFP4-torch.float32-(10,)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingFP4-torch.float32-(10,)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingFP4-torch.float32-(10, 10)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingFP4-torch.float32-(10, 10)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingFP4-torch.float32-(10, 10, 10)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingFP4-torch.float32-(10, 10, 10)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingNF4-torch.uint8-(10,)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingNF4-torch.uint8-(10,)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingNF4-torch.uint8-(10, 10)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingNF4-torch.uint8-(10, 10)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingNF4-torch.uint8-(10, 10, 10)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingNF4-torch.uint8-(10, 10, 10)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingNF4-torch.float32-(10,)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingNF4-torch.float32-(10,)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingNF4-torch.float32-(10, 10)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingNF4-torch.float32-(10, 10)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingNF4-torch.float32-(10, 10, 10)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_lossless[EmbeddingNF4-torch.float32-(10, 10, 10)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_error[Embedding8bit-None-(10,)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_error[Embedding8bit-None-(10,)-65-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_error[Embedding8bit-None-(10, 10)-64-cuda] PASSED [ 87%] tests/test_modules.py::test_embedding_error[Embedding8bit-None-(10, 10)-65-cuda] PASSED [ 88%]
tests/test_modules.py::test_embedding_error[Embedding8bit-None-(10, 10, 10)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[Embedding8bit-None-(10, 10, 10)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingFP4-torch.uint8-(10,)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingFP4-torch.uint8-(10,)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingFP4-torch.uint8-(10, 10)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingFP4-torch.uint8-(10, 10)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingFP4-torch.uint8-(10, 10, 10)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingFP4-torch.uint8-(10, 10, 10)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingFP4-torch.float32-(10,)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingFP4-torch.float32-(10,)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingFP4-torch.float32-(10, 10)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingFP4-torch.float32-(10, 10)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingFP4-torch.float32-(10, 10, 10)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingFP4-torch.float32-(10, 10, 10)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingNF4-torch.uint8-(10,)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingNF4-torch.uint8-(10,)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingNF4-torch.uint8-(10, 10)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingNF4-torch.uint8-(10, 10)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingNF4-torch.uint8-(10, 10, 10)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingNF4-torch.uint8-(10, 10, 10)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingNF4-torch.float32-(10,)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingNF4-torch.float32-(10,)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingNF4-torch.float32-(10, 10)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingNF4-torch.float32-(10, 10)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingNF4-torch.float32-(10, 10, 10)-64-cuda] PASSED [ 88%] tests/test_modules.py::test_embedding_error[EmbeddingNF4-torch.float32-(10, 10, 10)-65-cuda] PASSED [ 88%] tests/test_modules.py::test_4bit_linear_warnings[cuda] FAILED [ 88%] tests/test_modules.py::test_4bit_embedding_warnings[cuda] PASSED [ 88%] tests/test_modules.py::test_4bit_embedding_weight_fsdp_fix PASSED [ 88%] tests/test_modules.py::test_4bit_linear_weight_fsdp_fix PASSED [ 88%] tests/test_modules.py::test_embedding_not_implemented_error PASSED [ 88%] tests/test_ops.py::TestLLMInt8Ops::test_int8_linear_matmul[cuda] PASSED [ 88%] tests/test_ops.py::TestLLMInt8Ops::test_int8_linear_matmul_out[cuda] PASSED [ 88%] tests/test_ops.py::TestLLMInt8Ops::test_int8_vectorwise_quant[cuda-0.0] PASSED [ 88%] tests/test_ops.py::TestLLMInt8Ops::test_int8_vectorwise_quant[cuda-6.0] PASSED [ 88%] tests/test_ops.py::TestLLMInt8Ops::test_int8_mm_dequant[cuda] PASSED [ 88%] tests/test_ops.py::TestLLMInt8Ops::test_int8_scaled_mm[True-dtype=fp16-cuda] PASSED [ 88%]
tests/test_ops.py::TestLLMInt8Ops::test_int8_scaled_mm[True-dtype=bf16-cuda] PASSED [ 88%] tests/test_ops.py::TestLLMInt8Ops::test_int8_scaled_mm[True-dtype=fp32-cuda] PASSED [ 88%] tests/test_ops.py::TestLLMInt8Ops::test_int8_scaled_mm[False-dtype=fp16-cuda] PASSED [ 89%] tests/test_ops.py::TestLLMInt8Ops::test_int8_scaled_mm[False-dtype=bf16-cuda] PASSED [ 89%] tests/test_ops.py::TestLLMInt8Ops::test_int8_scaled_mm[False-dtype=fp32-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_quantize_blockwise[64-dtype=fp16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_quantize_blockwise[64-dtype=bf16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_quantize_blockwise[64-dtype=fp32-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_quantize_blockwise[128-dtype=fp16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_quantize_blockwise[128-dtype=bf16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_quantize_blockwise[128-dtype=fp32-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_quantize_blockwise[256-dtype=fp16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_quantize_blockwise[256-dtype=bf16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_quantize_blockwise[256-dtype=fp32-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_quantize_blockwise[512-dtype=fp16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_quantize_blockwise[512-dtype=bf16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_quantize_blockwise[512-dtype=fp32-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_dequantize_blockwise[64-dtype=fp16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_dequantize_blockwise[64-dtype=bf16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_dequantize_blockwise[64-dtype=fp32-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_dequantize_blockwise[128-dtype=fp16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_dequantize_blockwise[128-dtype=bf16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_dequantize_blockwise[128-dtype=fp32-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_dequantize_blockwise[256-dtype=fp16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_dequantize_blockwise[256-dtype=bf16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_dequantize_blockwise[256-dtype=fp32-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_dequantize_blockwise[512-dtype=fp16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_dequantize_blockwise[512-dtype=bf16-cuda] PASSED [ 89%] tests/test_ops.py::TestInt8BlockwiseQuantOps::test_dequantize_blockwise[512-dtype=fp32-cuda] PASSED [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[64-fp4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[64-fp4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[64-fp4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[64-fp4-storage_dtype=bf16-dtype=fp16-cuda] XFAIL [ 89%] 
tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[64-fp4-storage_dtype=bf16-dtype=bf16-cuda] XFAIL [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[64-fp4-storage_dtype=bf16-dtype=fp32-cuda] XFAIL [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[64-nf4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[64-nf4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[64-nf4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[64-nf4-storage_dtype=bf16-dtype=fp16-cuda] XFAIL [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[64-nf4-storage_dtype=bf16-dtype=bf16-cuda] XFAIL [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[64-nf4-storage_dtype=bf16-dtype=fp32-cuda] XFAIL [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[128-fp4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 89%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[128-fp4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[128-fp4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[128-fp4-storage_dtype=bf16-dtype=fp16-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[128-fp4-storage_dtype=bf16-dtype=bf16-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[128-fp4-storage_dtype=bf16-dtype=fp32-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[128-nf4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[128-nf4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[128-nf4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[128-nf4-storage_dtype=bf16-dtype=fp16-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[128-nf4-storage_dtype=bf16-dtype=bf16-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[128-nf4-storage_dtype=bf16-dtype=fp32-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[256-fp4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[256-fp4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[256-fp4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[256-fp4-storage_dtype=bf16-dtype=fp16-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[256-fp4-storage_dtype=bf16-dtype=bf16-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[256-fp4-storage_dtype=bf16-dtype=fp32-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[256-nf4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[256-nf4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[256-nf4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 90%]
tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[256-nf4-storage_dtype=bf16-dtype=fp16-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[256-nf4-storage_dtype=bf16-dtype=bf16-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[256-nf4-storage_dtype=bf16-dtype=fp32-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[512-fp4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[512-fp4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[512-fp4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[512-fp4-storage_dtype=bf16-dtype=fp16-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[512-fp4-storage_dtype=bf16-dtype=bf16-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[512-fp4-storage_dtype=bf16-dtype=fp32-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[512-nf4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[512-nf4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[512-nf4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[512-nf4-storage_dtype=bf16-dtype=fp16-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[512-nf4-storage_dtype=bf16-dtype=bf16-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_quantize_4bit[512-nf4-storage_dtype=bf16-dtype=fp32-cuda] XFAIL [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[64-fp4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[64-fp4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[64-fp4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[64-fp4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[64-fp4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 90%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[64-fp4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[64-nf4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[64-nf4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[64-nf4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[64-nf4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[64-nf4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[64-nf4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[128-fp4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 91%]
tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[128-fp4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[128-fp4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[128-fp4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[128-fp4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[128-fp4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[128-nf4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[128-nf4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[128-nf4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[128-nf4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[128-nf4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[128-nf4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[256-fp4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[256-fp4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[256-fp4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[256-fp4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[256-fp4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[256-fp4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[256-nf4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[256-nf4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[256-nf4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[256-nf4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[256-nf4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[256-nf4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[512-fp4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[512-fp4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[512-fp4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[512-fp4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[512-fp4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 91%] 
tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[512-fp4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[512-nf4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[512-nf4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[512-nf4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[512-nf4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 91%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[512-nf4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_dequantize_4bit[512-nf4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[64-fp4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[64-fp4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[64-fp4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[64-fp4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[64-fp4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[64-fp4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[64-nf4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[64-nf4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[64-nf4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[64-nf4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[64-nf4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[64-nf4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[128-fp4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[128-fp4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[128-fp4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[128-fp4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[128-fp4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[128-fp4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[128-nf4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[128-nf4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[128-nf4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[128-nf4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 92%] 
tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[128-nf4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[128-nf4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[256-fp4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[256-fp4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[256-fp4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[256-fp4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[256-fp4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[256-fp4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[256-nf4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[256-nf4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[256-nf4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[256-nf4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[256-nf4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[256-nf4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[512-fp4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[512-fp4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 92%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[512-fp4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 93%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[512-fp4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 93%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[512-fp4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 93%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[512-fp4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 93%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[512-nf4-storage_dtype=uint8-dtype=fp16-cuda] PASSED [ 93%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[512-nf4-storage_dtype=uint8-dtype=bf16-cuda] PASSED [ 93%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[512-nf4-storage_dtype=uint8-dtype=fp32-cuda] PASSED [ 93%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[512-nf4-storage_dtype=bf16-dtype=fp16-cuda] PASSED [ 93%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[512-nf4-storage_dtype=bf16-dtype=bf16-cuda] PASSED [ 93%] tests/test_ops.py::Test4bitBlockwiseQuantOps::test_gemv_4bit[512-nf4-storage_dtype=bf16-dtype=fp32-cuda] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp32-opt=adam] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp32-opt=paged_adamw] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp32-opt=paged_adam] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp32-opt=momentum] PASSED [ 93%] 
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp32-opt=rmsprop] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp32-opt=lion] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp32-opt=paged_lion] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp32-opt=ademamix] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp32-opt=ademamix_scheduled] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp32-opt=paged_ademamix] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp32-opt=paged_ademamix_scheduled] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp16-opt=adam] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp16-opt=paged_adamw] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp16-opt=paged_adam] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp16-opt=momentum] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp16-opt=rmsprop] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp16-opt=lion] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp16-opt=paged_lion] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp16-opt=ademamix] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp16-opt=ademamix_scheduled] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp16-opt=paged_ademamix] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-fp16-opt=paged_ademamix_scheduled] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-bf16-opt=adam] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-bf16-opt=paged_adamw] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-bf16-opt=paged_adam] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-bf16-opt=momentum] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-bf16-opt=rmsprop] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-bf16-opt=lion] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-bf16-opt=paged_lion] SKIPPED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-bf16-opt=ademamix] PASSED [ 93%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-bf16-opt=ademamix_scheduled] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-bf16-opt=paged_ademamix] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=32-dim1=1024-bf16-opt=paged_ademamix_scheduled] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp32-opt=adam] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp32-opt=paged_adamw] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp32-opt=paged_adam] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp32-opt=momentum] PASSED [ 94%] 
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp32-opt=rmsprop] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp32-opt=lion] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp32-opt=paged_lion] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp32-opt=ademamix] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp32-opt=ademamix_scheduled] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp32-opt=paged_ademamix] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp32-opt=paged_ademamix_scheduled] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp16-opt=adam] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp16-opt=paged_adamw] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp16-opt=paged_adam] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp16-opt=momentum] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp16-opt=rmsprop] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp16-opt=lion] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp16-opt=paged_lion] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp16-opt=ademamix] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp16-opt=ademamix_scheduled] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp16-opt=paged_ademamix] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-fp16-opt=paged_ademamix_scheduled] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-bf16-opt=adam] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-bf16-opt=paged_adamw] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-bf16-opt=paged_adam] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-bf16-opt=momentum] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-bf16-opt=rmsprop] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-bf16-opt=lion] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-bf16-opt=paged_lion] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-bf16-opt=ademamix] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-bf16-opt=ademamix_scheduled] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-bf16-opt=paged_ademamix] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1024-dim1=1024-bf16-opt=paged_ademamix_scheduled] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp32-opt=adam] PASSED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp32-opt=paged_adamw] SKIPPED [ 94%] tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp32-opt=paged_adam] SKIPPED [ 94%] 
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp32-opt=momentum] PASSED [ 94%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp32-opt=rmsprop] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp32-opt=lion] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp32-opt=paged_lion] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp32-opt=ademamix] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp32-opt=ademamix_scheduled] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp32-opt=paged_ademamix] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp32-opt=paged_ademamix_scheduled] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp16-opt=adam] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp16-opt=paged_adamw] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp16-opt=paged_adam] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp16-opt=momentum] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp16-opt=rmsprop] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp16-opt=lion] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp16-opt=paged_lion] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp16-opt=ademamix] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp16-opt=ademamix_scheduled] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp16-opt=paged_ademamix] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-fp16-opt=paged_ademamix_scheduled] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-bf16-opt=adam] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-bf16-opt=paged_adamw] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-bf16-opt=paged_adam] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-bf16-opt=momentum] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-bf16-opt=rmsprop] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-bf16-opt=lion] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-bf16-opt=paged_lion] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-bf16-opt=ademamix] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-bf16-opt=ademamix_scheduled] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-bf16-opt=paged_ademamix] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=4097-dim1=1024-bf16-opt=paged_ademamix_scheduled] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp32-opt=adam] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp32-opt=paged_adamw] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp32-opt=paged_adam] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp32-opt=momentum] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp32-opt=rmsprop] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp32-opt=lion] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp32-opt=paged_lion] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp32-opt=ademamix] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp32-opt=ademamix_scheduled] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp32-opt=paged_ademamix] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp32-opt=paged_ademamix_scheduled] SKIPPED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp16-opt=adam] PASSED [ 95%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp16-opt=paged_adamw] SKIPPED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp16-opt=paged_adam] SKIPPED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp16-opt=momentum] PASSED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp16-opt=rmsprop] PASSED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp16-opt=lion] PASSED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp16-opt=paged_lion] SKIPPED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp16-opt=ademamix] PASSED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp16-opt=ademamix_scheduled] PASSED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp16-opt=paged_ademamix] SKIPPED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-fp16-opt=paged_ademamix_scheduled] SKIPPED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-bf16-opt=adam] PASSED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-bf16-opt=paged_adamw] SKIPPED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-bf16-opt=paged_adam] SKIPPED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-bf16-opt=momentum] SKIPPED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-bf16-opt=rmsprop] SKIPPED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-bf16-opt=lion] PASSED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-bf16-opt=paged_lion] SKIPPED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-bf16-opt=ademamix] PASSED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-bf16-opt=ademamix_scheduled] PASSED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-bf16-opt=paged_ademamix] SKIPPED [ 96%]
tests/test_optim.py::test_optimizer32bit[device=cuda-dim2=1-dim1=1024-bf16-opt=paged_ademamix_scheduled] SKIPPED [ 96%]
tests/test_optim.py::test_global_config[cuda-fp32-dim2=32-dim1=1024] PASSED [ 96%]
tests/test_optim.py::test_global_config[cuda-fp32-dim2=1024-dim1=1024] PASSED [ 96%]
tests/test_optim.py::test_global_config[cuda-fp32-dim2=4097-dim1=1024] PASSED [ 96%]
tests/test_optim.py::test_global_config[cuda-fp16-dim2=32-dim1=1024] PASSED [ 96%]
tests/test_optim.py::test_global_config[cuda-fp16-dim2=1024-dim1=1024] PASSED [ 96%]
tests/test_optim.py::test_global_config[cuda-fp16-dim2=4097-dim1=1024] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-fp32-opt=adam8bit_blockwise] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-fp32-opt=lion8bit_blockwise] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-fp32-opt=momentum8bit_blockwise] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-fp32-opt=rmsprop8bit_blockwise] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-fp32-opt=ademamix8bit_blockwise] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-fp32-opt=ademamix8bit_blockwise_scheduled] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-fp16-opt=adam8bit_blockwise] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-fp16-opt=lion8bit_blockwise] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-fp16-opt=momentum8bit_blockwise] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-fp16-opt=rmsprop8bit_blockwise] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-fp16-opt=ademamix8bit_blockwise] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-fp16-opt=ademamix8bit_blockwise_scheduled] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-bf16-opt=adam8bit_blockwise] PASSED [ 96%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-bf16-opt=lion8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-bf16-opt=momentum8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-bf16-opt=rmsprop8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-bf16-opt=ademamix8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=32-bf16-opt=ademamix8bit_blockwise_scheduled] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-fp32-opt=adam8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-fp32-opt=lion8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-fp32-opt=momentum8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-fp32-opt=rmsprop8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-fp32-opt=ademamix8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-fp32-opt=ademamix8bit_blockwise_scheduled] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-fp16-opt=adam8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-fp16-opt=lion8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-fp16-opt=momentum8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-fp16-opt=rmsprop8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-fp16-opt=ademamix8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-fp16-opt=ademamix8bit_blockwise_scheduled] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-bf16-opt=adam8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-bf16-opt=lion8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-bf16-opt=momentum8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-bf16-opt=rmsprop8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-bf16-opt=ademamix8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=1024-bf16-opt=ademamix8bit_blockwise_scheduled] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-fp32-opt=adam8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-fp32-opt=lion8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-fp32-opt=momentum8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-fp32-opt=rmsprop8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-fp32-opt=ademamix8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-fp32-opt=ademamix8bit_blockwise_scheduled] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-fp16-opt=adam8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-fp16-opt=lion8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-fp16-opt=momentum8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-fp16-opt=rmsprop8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-fp16-opt=ademamix8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-fp16-opt=ademamix8bit_blockwise_scheduled] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-bf16-opt=adam8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-bf16-opt=lion8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-bf16-opt=momentum8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-bf16-opt=rmsprop8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-bf16-opt=ademamix8bit_blockwise] PASSED [ 97%]
tests/test_optim.py::test_optimizer8bit[cuda-dim1=1024-dim2=4097-bf16-opt=ademamix8bit_blockwise_scheduled] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[64-compress_statistics=T-nf4-fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[64-compress_statistics=T-nf4-fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[64-compress_statistics=T-nf4-bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[64-compress_statistics=T-fp4-fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[64-compress_statistics=T-fp4-fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[64-compress_statistics=T-fp4-bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[64-compress_statistics=F-nf4-fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[64-compress_statistics=F-nf4-fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[64-compress_statistics=F-nf4-bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[64-compress_statistics=F-fp4-fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[64-compress_statistics=F-fp4-fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[64-compress_statistics=F-fp4-bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[128-compress_statistics=T-nf4-fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[128-compress_statistics=T-nf4-fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[128-compress_statistics=T-nf4-bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[128-compress_statistics=T-fp4-fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[128-compress_statistics=T-fp4-fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[128-compress_statistics=T-fp4-bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[128-compress_statistics=F-nf4-fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[128-compress_statistics=F-nf4-fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[128-compress_statistics=F-nf4-bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[128-compress_statistics=F-fp4-fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[128-compress_statistics=F-fp4-fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[128-compress_statistics=F-fp4-bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[256-compress_statistics=T-nf4-fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[256-compress_statistics=T-nf4-fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[256-compress_statistics=T-nf4-bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[256-compress_statistics=T-fp4-fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[256-compress_statistics=T-fp4-fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[256-compress_statistics=T-fp4-bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[256-compress_statistics=F-nf4-fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[256-compress_statistics=F-nf4-fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[256-compress_statistics=F-nf4-bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[256-compress_statistics=F-fp4-fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[256-compress_statistics=F-fp4-fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_replace_parameter_4bit[256-compress_statistics=F-fp4-bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_moe_parameter_shape[fp32-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_moe_parameter_shape[fp16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_moe_parameter_shape[bf16-cuda] PASSED [ 98%]
tests/test_parametrize.py::test_prequantized_replacement[nf4-fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_prequantized_replacement[nf4-fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_prequantized_replacement[nf4-bf16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_prequantized_replacement[fp4-fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_prequantized_replacement[fp4-fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_prequantized_replacement[fp4-bf16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_state_dict_functionality[compress_statistics=T-nf4-fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_state_dict_functionality[compress_statistics=T-nf4-fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_state_dict_functionality[compress_statistics=T-nf4-bf16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_state_dict_functionality[compress_statistics=T-fp4-fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_state_dict_functionality[compress_statistics=T-fp4-fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_state_dict_functionality[compress_statistics=T-fp4-bf16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_state_dict_functionality[compress_statistics=F-nf4-fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_state_dict_functionality[compress_statistics=F-nf4-fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_state_dict_functionality[compress_statistics=F-nf4-bf16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_state_dict_functionality[compress_statistics=F-fp4-fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_state_dict_functionality[compress_statistics=F-fp4-fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_state_dict_functionality[compress_statistics=F-fp4-bf16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_moe_realistic_forward[fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_moe_realistic_forward[fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_moe_realistic_forward[bf16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_error_conditions PASSED [ 99%]
tests/test_parametrize.py::test_quant_state_preservation[fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_quant_state_preservation[fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_quant_state_preservation[bf16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_multiple_parameters[fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_multiple_parameters[fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_multiple_parameters[bf16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_different_blocksizes[64-fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_different_blocksizes[64-fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_different_blocksizes[64-bf16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_different_blocksizes[128-fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_different_blocksizes[128-fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_different_blocksizes[128-bf16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_different_blocksizes[256-fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_different_blocksizes[256-fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_different_blocksizes[256-bf16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_parametrization_forward_method PASSED [ 99%]
tests/test_parametrize.py::test_gradient_behavior[fp32-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_gradient_behavior[fp16-cuda] PASSED [ 99%]
tests/test_parametrize.py::test_gradient_behavior[bf16-cuda] PASSED [100%]

================================== FAILURES ===================================
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-uint8-fp32-fc2-nf4-DQ_True-cuda] _

self =
device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.uint8, double_quant = True, kind = 'fc2'

@pytest.mark.skipif(
    ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64"
)
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
@pytest.mark.parametrize("storage_type", ["nf4", "fp4"])
@pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize(
    "quant_storage",
    [torch.uint8, torch.float16, torch.bfloat16, torch.float32],
    ids=describe_dtype,
)
@pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
    if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
        pytest.skip("This configuration is not supported on HPU.")

    errs1 = []
    errs2 = []
    errs3 = []
    relerrs1 = []
    relerrs2 = []
    relerrs3 = []
    max_errs1 = []
    max_errs2 = []
    max_errs3 = []

    # Large number of iterations is excessive and slow on CPU.
    # Keep for CUDA/XPU for now.
    iters = 10 if device == "cpu" else 100

    for i in range(iters):
        if kind == "fc1":
            A = torch.randn(1, dim, dtype=dtype, device=device)
            B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim)
        elif kind == "fc2":
            A = torch.randn(1, 4 * dim, dtype=dtype, device=device)
            B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim)
        elif kind == "attn":
            A = torch.randn(1, dim, dtype=dtype, device=device)
            B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim)
        elif kind == "attn_packed":
            A = torch.randn(1, dim, dtype=dtype, device=device)
            B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim)

        qB, state = F.quantize_4bit(
            B,
            quant_type=storage_type,
            compress_statistics=double_quant,
            quant_storage=quant_storage,
        )
        C3 = torch.matmul(A, B.t())

        # CPU requires convert weight packed for gemv
        if device == "cpu" and F.has_avx512bf16():
            qB, state = F._convert_weight_packed_for_cpu(qB, state)
            qB = qB.t()

        C2 = F.gemv_4bit(A, qB.t(), state=state)
        A.requires_grad = True
        C1 = bnb.matmul_4bit(A, qB.t(), state)

        err1 = (C1 - C2).abs().float()
        err2 = (C3 - C2).abs().float()
        err3 = (C3 - C1).abs().float()

        mag1 = torch.abs(C1).float() + 1e-5
        mag2 = torch.abs(C3).float() + 1e-5
        mag3 = torch.abs(C3).float() + 1e-5

        relerr1 = err1 / mag1
        relerr2 = err2 / mag2
        relerr3 = err3 / mag3

        max_err1 = err1.max()
        max_err2 = err2.max()
        max_err3 = err3.max()

        errs1.append(err1.mean().item())
        errs2.append(err2.mean().item())
        errs3.append(err3.mean().item())
        relerrs1.append(relerr1.mean().item())
        relerrs2.append(relerr2.mean().item())
        relerrs3.append(relerr3.mean().item())
        max_errs1.append(max_err1.item())
        max_errs2.append(max_err2.item())
        max_errs3.append(max_err3.item())

        c = int(C1.numel() * 0.0014 * (dim / 256)) + 1
        c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False)

    err1 = sum(errs1) / len(errs1) / math.sqrt(dim)
    err2 = sum(errs2) / len(errs2) / math.sqrt(dim)
    err3 = sum(errs3) / len(errs3) / math.sqrt(dim)
    relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim)
    relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim)
    relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim)
    maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim)
    maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim)
    maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim)
    absratio = err2 / err3
    relratio = relerr2 / relerr3
    maxratio = relerr2 / relerr3

    # for debugging if the tests fails
    #
    # print('='*80)
    # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:')
    # print(C1.flatten()[-20:])
    # print(C2.flatten()[-20:])
    # print(f'inference vs training abs: {err1}')
    # print(f'inference vs training rel: {relerr1}')
    # print(f'inference vs training max: {maxerr1}')
    # print(f'inference vs training vs torch err ratio abs: {absratio}')
    # print(f'inference vs training vs torch err ratio rel: {relratio}')
    # print(f'inference vs training vs torch err ratio max: {maxratio}')

    if dtype == torch.float16:
        if dim <= 512:
            assert err1 < 7e-5
            # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727
            if (
                device == "cuda"
                and double_quant
                and storage_type == "fp4"
                and kind == "fc2"
                and torch.cuda.get_device_capability() == (7, 5)
            ):
                assert relerr1 < 0.00093
            else:
                assert relerr1 < 0.0008
        else:
            assert err1 < 6e-5
            assert relerr1 < 2e-4
        assert absratio < 1.005 and absratio > 0.995
        assert relratio < 1.005 and relratio > 0.992
        assert maxratio < 1.005 and maxratio > 0.992
    elif dtype == torch.float32:
        if dim <= 512:
            assert err1 < 5e-8
            assert relerr1 < 1e-6
>           assert maxerr1 < 1.05e-7
E           assert 2.754823385793311e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
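The failures in this section all share one configuration: dim=128, compute dtype fp32, kind fc2, on cuda. Only quant_storage and double_quant vary, and the reported error values repeat exactly across quant_storage=uint8/fp16/bf16 (e.g. maxerr1 = 2.754823385793311e-07 for every nf4/DQ_True case), so the storage dtype evidently does not change the numerics. The err1 misses are within about 5% of the 5e-8 threshold (5.207e-08 and 5.226e-08), while maxerr1 overshoots 1.05e-7 by roughly 2.6x. The following is a minimal standalone sketch for re-measuring the failing quantity outside pytest; it is an assumption-laden repro, not part of the test suite, and uses only the calls that appear in the traceback above (F.quantize_4bit, F.gemv_4bit, bnb.matmul_4bit).

# Standalone repro sketch (editor's addition, not from the test suite).
# Assumes a CUDA build of bitsandbytes; mirrors the failing case above:
# dim=128, fp32, kind=fc2, nf4, double_quant=True, default uint8 storage.
import math
import torch
import bitsandbytes as bnb
import bitsandbytes.functional as F

dim, iters = 128, 100
errs1, max_errs1 = [], []
for _ in range(iters):
    # "fc2" shapes from the test: A is (1, 4*dim), B is (dim, 4*dim).
    A = torch.randn(1, 4 * dim, dtype=torch.float32, device="cuda")
    B = torch.randn(dim, 4 * dim, dtype=torch.float32, device="cuda") / math.sqrt(dim)
    qB, state = F.quantize_4bit(B, quant_type="nf4", compress_statistics=True)
    C2 = F.gemv_4bit(A, qB.t(), state=state)  # inference gemv kernel
    C1 = bnb.matmul_4bit(A, qB.t(), state)    # autograd/training path
    err = (C1 - C2).abs().float()
    errs1.append(err.mean().item())
    max_errs1.append(err.max().item())

# Same normalization as the test: average over iterations, then / sqrt(dim).
err1 = sum(errs1) / len(errs1) / math.sqrt(dim)
maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim)
print(f"err1    = {err1:.3e}  (test asserts < 5e-8)")
print(f"maxerr1 = {maxerr1:.3e}  (test asserts < 1.05e-7)")

Note this sketch only reproduces the err1/maxerr1 agreement check between the two quantized paths, which is the quantity the assertions at tests\test_functional.py:1408 and :1410 gate on; the test's comparison against the fp32 torch.matmul reference (C3) is unaffected here.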
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 2.781692001719006e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-uint8-fp32-fc2-fp4-DQ_True-cuda] _ self = device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.uint8, double_quant = True, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / 
len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: > assert err1 < 5e-8 E assert 5.2071000295163325e-08 < 5e-08 tests\test_functional.py:1408: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-uint8-fp32-fc2-fp4-DQ_False-cuda] _ self = device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.uint8, double_quant = False, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: > assert err1 
< 5e-8 E assert 5.2257069163957734e-08 < 5e-08 tests\test_functional.py:1408: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-fp16-fp32-fc2-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.float16, double_quant = True, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = 
sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 2.754823385793311e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-fp16-fp32-fc2-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.float16, double_quant = False, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 2.781692001719006e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-fp16-fp32-fc2-fp4-DQ_True-cuda] _ self = device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.float16, double_quant = True, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / 
len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: > assert err1 < 5e-8 E assert 5.2071000295163325e-08 < 5e-08 tests\test_functional.py:1408: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-fp16-fp32-fc2-fp4-DQ_False-cuda] _ self = device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.float16, double_quant = False, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: > assert err1 
< 5e-8 E assert 5.2257069163957734e-08 < 5e-08 tests\test_functional.py:1408: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-bf16-fp32-fc2-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.bfloat16, double_quant = True, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = 
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-bf16-fp32-fc2-nf4-DQ_False-cuda] _

device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'fc2'

    (test body identical to the listing above)
>               assert maxerr1 < 1.05e-7
E       assert 2.781692001719006e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-bf16-fp32-fc2-fp4-DQ_True-cuda] _

device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'fc2'

    (test body identical to the listing above)
>               assert err1 < 5e-8
E       assert 5.2071000295163325e-08 < 5e-08

tests\test_functional.py:1408: AssertionError

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-bf16-fp32-fc2-fp4-DQ_False-cuda] _

device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'fc2'

    (test body identical to the listing above)
>               assert err1 < 5e-8
E       assert 5.2257069163957734e-08 < 5e-08

tests\test_functional.py:1408: AssertionError
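Note how the aggregated metrics in the listing are normalized: per-iteration mean absolute errors are averaged over the iterations and then divided by sqrt(dim), so a fixed threshold such as the 5e-8 fp32 bound corresponds to a raw error budget that grows with sqrt(dim). A small illustrative helper (the name is ours, not the test suite's):

    import math

    def normalized_error(per_iter_means, dim):
        # Mirrors: err1 = sum(errs1) / len(errs1) / math.sqrt(dim)
        return sum(per_iter_means) / len(per_iter_means) / math.sqrt(dim)

    # The err1 failures above sit roughly 4-5% over the 5e-8 bound at dim=128,
    # i.e. a raw mean abs error of about 5.9e-7 per output element:
    print(5.2257069163957734e-08 * math.sqrt(128))  # ~5.91e-07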
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-fp32-fp32-fc2-nf4-DQ_True-cuda] _

device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = True, kind = 'fc2'

    (test body identical to the listing above)
>               assert maxerr1 < 1.05e-7
E       assert 2.754823385793311e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-fp32-fp32-fc2-nf4-DQ_False-cuda] _

device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = False, kind = 'fc2'

    (test body identical to the listing above)
>               assert maxerr1 < 1.05e-7
E       assert 2.781692001719006e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-fp32-fp32-fc2-fp4-DQ_True-cuda] _

device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = True, kind = 'fc2'

    (test body identical to the listing above)
>               assert err1 < 5e-8
E       assert 5.2071000295163325e-08 < 5e-08

tests\test_functional.py:1408: AssertionError

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=128-fp32-fp32-fc2-fp4-DQ_False-cuda] _

device = 'cuda', dim = 128, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = False, kind = 'fc2'

    (test body identical to the listing above)
>               assert err1 < 5e-8
E       assert 5.2257069163957734e-08 < 5e-08

tests\test_functional.py:1408: AssertionError

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-uint8-fp32-fc1-nf4-DQ_True-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.uint8, double_quant = True, kind = 'fc1'

    (test body identical to the listing above)
>               assert maxerr1 < 1.05e-7
E       assert 1.0531395673751831e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-uint8-fp32-fc1-fp4-DQ_True-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.uint8, double_quant = True, kind = 'fc1'

    (test body identical to the listing above)
>               assert maxerr1 < 1.05e-7
E       assert 1.101568341255188e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-uint8-fp32-fc1-fp4-DQ_False-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.uint8, double_quant = False, kind = 'fc1'

    (test body identical to the listing above)
>               assert maxerr1 < 1.05e-7
E       assert 1.1228024959564209e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-uint8-fp32-fc2-nf4-DQ_True-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.uint8, double_quant = True, kind = 'fc2'

    (test body identical to the listing above)
>               assert maxerr1 < 1.05e-7
E       assert 3.2232608646154406e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
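Taken together, the dim=256 fp32 maxerr1 failures above range from marginal to roughly 3x over the 1.05e-7 bound. A quick illustrative check of the margins, using the observed values from the tracebacks:

    bound = 1.05e-7
    observed = [
        1.0531395673751831e-07,  # fc1-nf4-DQ_True
        1.101568341255188e-07,   # fc1-fp4-DQ_True
        1.1228024959564209e-07,  # fc1-fp4-DQ_False
        3.2232608646154406e-07,  # fc2-nf4-DQ_True
    ]
    for v in observed:
        print(f"{v:.4e} -> {100 * (v / bound - 1):+.1f}% over the bound")
    # +0.3%, +4.9%, +6.9%, +207.0%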
5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 3.2232608646154406e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-uint8-fp32-fc2-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.uint8, double_quant = False, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) 
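The failing comparison above can be reproduced outside pytest with a handful of calls. Below is a minimal sketch distilled from the test body, for the fp32/fc2/nf4 case at dim=256 with double quantization; the fixed seed and the printed quantities are illustrative assumptions, not part of the test suite.

import math

import torch

import bitsandbytes as bnb
import bitsandbytes.functional as F

torch.manual_seed(0)  # assumption: pin the RNG for a repeatable single run
device, dim, dtype = "cuda", 256, torch.float32

# fc2-shaped operands, scaled the same way the test scales them
A = torch.randn(1, 4 * dim, dtype=dtype, device=device)
B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim)

# NF4 quantization with double quantization (compressed statistics), as in DQ_True
qB, state = F.quantize_4bit(B, quant_type="nf4", compress_statistics=True, quant_storage=torch.uint8)

C3 = torch.matmul(A, B.t())               # fp32 reference
C2 = F.gemv_4bit(A, qB.t(), state=state)  # inference kernel under test
C1 = bnb.matmul_4bit(A, qB.t(), state)    # autograd/training path

print("inference vs training max abs err:", (C1 - C2).abs().max().item())
print("torch vs inference max abs err:   ", (C3 - C2).abs().max().item())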
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-uint8-fp32-fc2-nf4-DQ_False-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.uint8, double_quant = False, kind = 'fc2'

>               assert maxerr1 < 1.05e-7
E               assert 3.061443567276001e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-uint8-fp32-fc2-fp4-DQ_True-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.uint8, double_quant = True, kind = 'fc2'

>               assert err1 < 5e-8
E               assert 5.151305813910767e-08 < 5e-08

tests\test_functional.py:1408: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-uint8-fp32-fc2-fp4-DQ_False-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.uint8, double_quant = False, kind = 'fc2'

>               assert err1 < 5e-8
E               assert 5.141680848197439e-08 < 5e-08

tests\test_functional.py:1408: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp16-fp32-fc1-nf4-DQ_True-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float16, double_quant = True, kind = 'fc1'

>               assert maxerr1 < 1.05e-7
E               assert 1.0531395673751831e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp16-fp32-fc1-fp4-DQ_True-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = True, kind = 'fc1'

>               assert maxerr1 < 1.05e-7
E               assert 1.101568341255188e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
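The parametrization ids above map directly onto quantize_4bit arguments: nf4/fp4 is quant_type, DQ_True/DQ_False is compress_statistics, and uint8/fp16/bf16/fp32 is quant_storage. The identical E values across the uint8, fp16, and bf16 runs suggest quant_storage only changes the dtype the packed bytes are viewed as, not the numerics. A sketch of the four quant_type/DQ variants, assuming an fc2-shaped weight B as in the repro above (illustrative, not part of the suite):

import math

import torch

import bitsandbytes.functional as F

dim = 256
B = torch.randn(dim, 4 * dim, dtype=torch.float32, device="cuda") / math.sqrt(dim)

# the four storage_type/double_quant combinations exercised per dtype and kind
for quant_type in ("nf4", "fp4"):
    for compress_statistics in (True, False):
        qB, state = F.quantize_4bit(
            B,
            quant_type=quant_type,
            compress_statistics=compress_statistics,
            quant_storage=torch.uint8,
        )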
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp16-fp32-fc1-fp4-DQ_False-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = False, kind = 'fc1'

>               assert maxerr1 < 1.05e-7
E               assert 1.1228024959564209e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp16-fp32-fc2-nf4-DQ_True-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float16, double_quant = True, kind = 'fc2'

>               assert maxerr1 < 1.05e-7
E               assert 3.2232608646154406e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp16-fp32-fc2-nf4-DQ_False-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float16, double_quant = False, kind = 'fc2'

>               assert maxerr1 < 1.05e-7
E               assert 3.061443567276001e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp16-fp32-fc2-fp4-DQ_True-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = True, kind = 'fc2'

>               assert err1 < 5e-8
E               assert 5.151305813910767e-08 < 5e-08

tests\test_functional.py:1408: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp16-fp32-fc2-fp4-DQ_False-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = False, kind = 'fc2'

>               assert err1 < 5e-8
E               assert 5.141680848197439e-08 < 5e-08

tests\test_functional.py:1408: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-bf16-fp32-fc1-nf4-DQ_True-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'fc1'

>               assert maxerr1 < 1.05e-7
E               assert 1.0531395673751831e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
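For a sense of the margins involved, the snippet below recomputes how far each distinct fp32 failure at dim=256 overshoots its bound. The (measured, bound) pairs are copied verbatim from the E lines above; the script itself is illustrative, not part of the suite. It shows the fc1 maxerr1 and fc2 err1 misses are marginal (roughly 1.00x-1.07x the bound), while the fc2 maxerr1 misses are about 3x over.

# (measured, bound) pairs copied from the E lines above
failures = {
    "fc1-nf4-DQ_True  maxerr1": (1.0531395673751831e-07, 1.05e-07),
    "fc1-fp4-DQ_True  maxerr1": (1.101568341255188e-07, 1.05e-07),
    "fc1-fp4-DQ_False maxerr1": (1.1228024959564209e-07, 1.05e-07),
    "fc2-nf4-DQ_True  maxerr1": (3.2232608646154406e-07, 1.05e-07),
    "fc2-nf4-DQ_False maxerr1": (3.061443567276001e-07, 1.05e-07),
    "fc2-fp4-DQ_True  err1":    (5.151305813910767e-08, 5e-08),
    "fc2-fp4-DQ_False err1":    (5.141680848197439e-08, 5e-08),
}
for name, (measured, bound) in failures.items():
    print(f"{name}: {measured / bound:.3f}x the bound")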
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-bf16-fp32-fc1-fp4-DQ_True-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'fc1'

        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 <
5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 1.101568341255188e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-bf16-fp32-fc1-fp4-DQ_False-cuda] _ self = device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.bfloat16, double_quant = False, kind = 'fc1' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) 
/ len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 1.1228024959564209e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-bf16-fp32-fc2-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.bfloat16, double_quant = True, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 3.2232608646154406e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-bf16-fp32-fc2-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.bfloat16, double_quant = False, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = 
sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 3.061443567276001e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-bf16-fp32-fc2-fp4-DQ_True-cuda] _ self = device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.bfloat16, double_quant = True, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: > assert err1 
< 5e-8 E assert 5.151305813910767e-08 < 5e-08 tests\test_functional.py:1408: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-bf16-fp32-fc2-fp4-DQ_False-cuda] _ self = device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.bfloat16, double_quant = False, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = 
sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: > assert err1 < 5e-8 E assert 5.141680848197439e-08 < 5e-08 tests\test_functional.py:1408: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp32-fp32-fc1-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.float32, double_quant = True, kind = 'fc1' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 1.0531395673751831e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp32-fp32-fc1-fp4-DQ_True-cuda] _ self = device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.float32, double_quant = True, kind = 'fc1' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / 
len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 1.101568341255188e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp32-fp32-fc1-fp4-DQ_False-cuda] _ self = device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.float32, double_quant = False, kind = 'fc1' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 1.1228024959564209e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp32-fp32-fc2-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.float32, double_quant = True, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / 
len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 3.2232608646154406e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp32-fp32-fc2-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.float32, double_quant = False, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8
            assert relerr1 < 1e-6
>           assert maxerr1 < 1.05e-7
E           assert 3.061443567276001e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp32-fp32-fc2-fp4-DQ_True-cuda] _

self = <TestQuantize4BitFunctional object at ...>
device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = True, kind = 'fc2'

    @pytest.mark.skipif(
        ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64"
    )
    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
    @pytest.mark.parametrize("storage_type", ["nf4", "fp4"])
    @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
    @pytest.mark.parametrize(
        "quant_storage",
        [torch.uint8, torch.float16, torch.bfloat16, torch.float32],
        ids=describe_dtype,
    )
    @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
    def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
        if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
            pytest.skip("This configuration is not supported on HPU.")

        errs1 = []
        errs2 = []
        errs3 = []
        relerrs1 = []
        relerrs2 = []
        relerrs3 = []
        max_errs1 = []
        max_errs2 = []
        max_errs3 = []

        # Large number of iterations is excessive and slow on CPU.
        # Keep for CUDA/XPU for now.
        iters = 10 if device == "cpu" else 100

        for i in range(iters):
            if kind == "fc1":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "fc2":
                A = torch.randn(1, 4 * dim, dtype=dtype, device=device)
                B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "attn":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "attn_packed":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim)

            qB, state = F.quantize_4bit(
                B,
                quant_type=storage_type,
                compress_statistics=double_quant,
                quant_storage=quant_storage,
            )
            C3 = torch.matmul(A, B.t())

            # CPU requires convert weight packed for gemv
            if device == "cpu" and F.has_avx512bf16():
                qB, state = F._convert_weight_packed_for_cpu(qB, state)
                qB = qB.t()

            C2 = F.gemv_4bit(A, qB.t(), state=state)
            A.requires_grad = True
            C1 = bnb.matmul_4bit(A, qB.t(), state)

            err1 = (C1 - C2).abs().float()
            err2 = (C3 - C2).abs().float()
            err3 = (C3 - C1).abs().float()

            mag1 = torch.abs(C1).float() + 1e-5
            mag2 = torch.abs(C3).float() + 1e-5
            mag3 = torch.abs(C3).float() + 1e-5

            relerr1 = err1 / mag1
            relerr2 = err2 / mag2
            relerr3 = err3 / mag3

            max_err1 = err1.max()
            max_err2 = err2.max()
            max_err3 = err3.max()

            errs1.append(err1.mean().item())
            errs2.append(err2.mean().item())
            errs3.append(err3.mean().item())
            relerrs1.append(relerr1.mean().item())
            relerrs2.append(relerr2.mean().item())
            relerrs3.append(relerr3.mean().item())
            max_errs1.append(max_err1.item())
            max_errs2.append(max_err2.item())
            max_errs3.append(max_err3.item())

            c = int(C1.numel() * 0.0014 * (dim / 256)) + 1
            c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False)

        err1 = sum(errs1) / len(errs1) / math.sqrt(dim)
        err2 = sum(errs2) / len(errs2) / math.sqrt(dim)
        err3 = sum(errs3) / len(errs3) / math.sqrt(dim)
        relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim)
        relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim)
        relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim)
        maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim)
        maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim)
        maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim)
        absratio = err2 / err3
        relratio = relerr2 / relerr3
        maxratio = relerr2 / relerr3

        # for debugging if the tests fails
        #
        # print('='*80)
        # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:')
        # print(C1.flatten()[-20:])
        # print(C2.flatten()[-20:])
        # print(f'inference vs training abs: {err1}')
        # print(f'inference vs training rel: {relerr1}')
        # print(f'inference vs training max: {maxerr1}')
        # print(f'inference vs training vs torch err ratio abs: {absratio}')
        # print(f'inference vs training vs torch err ratio rel: {relratio}')
        # print(f'inference vs training vs torch err ratio max: {maxratio}')

        if dtype == torch.float16:
            if dim <= 512:
                assert err1 < 7e-5
                # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727
                if (
                    device == "cuda"
                    and double_quant
                    and storage_type == "fp4"
                    and kind == "fc2"
                    and torch.cuda.get_device_capability() == (7, 5)
                ):
                    assert relerr1 < 0.00093
                else:
                    assert relerr1 < 0.0008
            else:
                assert err1 < 6e-5
                assert relerr1 < 2e-4
            assert absratio < 1.005 and absratio > 0.995
            assert relratio < 1.005 and relratio > 0.992
            assert maxratio < 1.005 and maxratio > 0.992
        elif dtype == torch.float32:
            if dim <= 512:
>               assert err1 < 5e-8
E               assert 5.151305813910767e-08 < 5e-08

tests\test_functional.py:1408: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=256-fp32-fp32-fc2-fp4-DQ_False-cuda] _

device = 'cuda', dim = 256, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = False, kind = 'fc2'
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: > assert err1 
< 5e-8 E assert 5.141680848197439e-08 < 5e-08 tests\test_functional.py:1408: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-uint8-fp32-fc1-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.uint8, double_quant = True, kind = 'fc1' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = 
sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 1.125798300009158e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-uint8-fp32-fc1-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.uint8, double_quant = False, kind = 'fc1' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 1.1279550332728015e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-uint8-fp32-fc1-fp4-DQ_True-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.uint8, double_quant = True, kind = 'fc1' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / 
len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 1.1843264431561222e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-uint8-fp32-fc1-fp4-DQ_False-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.uint8, double_quant = False, kind = 'fc1' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 1.1704311540376867e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-uint8-fp32-fc2-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.uint8, double_quant = True, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / 
len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 3.5766869317557444e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-uint8-fp32-fc2-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.uint8, double_quant = False, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 3.6562391083200575e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-uint8-fp32-fc2-fp4-DQ_True-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.uint8, double_quant = True, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / 
len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: > assert err1 < 5e-8 E assert 5.1333022272426384e-08 < 5e-08 tests\test_functional.py:1408: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-uint8-fp32-fc2-fp4-DQ_False-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.uint8, double_quant = False, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: > assert err1 
< 5e-8 E assert 5.1402038864314965e-08 < 5e-08 tests\test_functional.py:1408: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-uint8-fp32-attn_packed-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.uint8, double_quant = True, kind = 'attn_packed' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) 
relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 1.1263745264536429e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-uint8-fp32-attn_packed-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.uint8, double_quant = False, kind = 'attn_packed' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
                assert err1 < 5e-8
                assert relerr1 < 1e-6
>               assert maxerr1 < 1.05e-7
E               assert 1.0624957091793192e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-uint8-fp32-attn_packed-fp4-DQ_True-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.uint8, double_quant = True, kind = 'attn_packed'

    @pytest.mark.skipif(
        ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64"
    )
    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
    @pytest.mark.parametrize("storage_type", ["nf4", "fp4"])
    @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
    @pytest.mark.parametrize(
        "quant_storage",
        [torch.uint8, torch.float16, torch.bfloat16, torch.float32],
        ids=describe_dtype,
    )
    @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
    def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
        if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
            pytest.skip("This configuration is not supported on HPU.")

        errs1 = []
        errs2 = []
        errs3 = []
        relerrs1 = []
        relerrs2 = []
        relerrs3 = []
        max_errs1 = []
        max_errs2 = []
        max_errs3 = []

        # Large number of iterations is excessive and slow on CPU.
        # Keep for CUDA/XPU for now.
        iters = 10 if device == "cpu" else 100

        for i in range(iters):
            if kind == "fc1":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "fc2":
                A = torch.randn(1, 4 * dim, dtype=dtype, device=device)
                B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "attn":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "attn_packed":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim)

            qB, state = F.quantize_4bit(
                B,
                quant_type=storage_type,
                compress_statistics=double_quant,
                quant_storage=quant_storage,
            )
            C3 = torch.matmul(A, B.t())

            # CPU requires convert weight packed for gemv
            if device == "cpu" and F.has_avx512bf16():
                qB, state = F._convert_weight_packed_for_cpu(qB, state)
                qB = qB.t()

            C2 = F.gemv_4bit(A, qB.t(), state=state)
            A.requires_grad = True
            C1 = bnb.matmul_4bit(A, qB.t(), state)

            err1 = (C1 - C2).abs().float()
            err2 = (C3 - C2).abs().float()
            err3 = (C3 - C1).abs().float()
            mag1 = torch.abs(C1).float() + 1e-5
            mag2 = torch.abs(C3).float() + 1e-5
            mag3 = torch.abs(C3).float() + 1e-5
            relerr1 = err1 / mag1
            relerr2 = err2 / mag2
            relerr3 = err3 / mag3
            max_err1 = err1.max()
            max_err2 = err2.max()
            max_err3 = err3.max()

            errs1.append(err1.mean().item())
            errs2.append(err2.mean().item())
            errs3.append(err3.mean().item())
            relerrs1.append(relerr1.mean().item())
            relerrs2.append(relerr2.mean().item())
            relerrs3.append(relerr3.mean().item())
            max_errs1.append(max_err1.item())
            max_errs2.append(max_err2.item())
            max_errs3.append(max_err3.item())

            c = int(C1.numel() * 0.0014 * (dim / 256)) + 1
            c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False)

        err1 = sum(errs1) / len(errs1) / math.sqrt(dim)
        err2 = sum(errs2) / len(errs2) / math.sqrt(dim)
        err3 = sum(errs3) / len(errs3) / math.sqrt(dim)
        relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim)
        relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim)
        relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim)
        maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim)
        maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim)
        maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim)
        absratio = err2 / err3
        relratio = relerr2 / relerr3
        maxratio = relerr2 / relerr3

        # for debugging if the tests fails
        #
        # print('='*80)
        # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:')
        # print(C1.flatten()[-20:])
        # print(C2.flatten()[-20:])
        # print(f'inference vs training abs: {err1}')
        # print(f'inference vs training rel: {relerr1}')
        # print(f'inference vs training max: {maxerr1}')
        # print(f'inference vs training vs torch err ratio abs: {absratio}')
        # print(f'inference vs training vs torch err ratio rel: {relratio}')
        # print(f'inference vs training vs torch err ratio max: {maxratio}')

        if dtype == torch.float16:
            if dim <= 512:
                assert err1 < 7e-5

                # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727
                if (
                    device == "cuda"
                    and double_quant
                    and storage_type == "fp4"
                    and kind == "fc2"
                    and torch.cuda.get_device_capability() == (7, 5)
                ):
                    assert relerr1 < 0.00093
                else:
                    assert relerr1 < 0.0008
            else:
                assert err1 < 6e-5
                assert relerr1 < 2e-4
            assert absratio < 1.005 and absratio > 0.995
            assert relratio < 1.005 and relratio > 0.992
            assert maxratio < 1.005 and maxratio > 0.992
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
>               assert maxerr1 < 1.05e-7
E               assert 1.1323014270254874e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-uint8-fp32-attn_packed-fp4-DQ_False-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.uint8, double_quant = False, kind = 'attn_packed'
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
>               assert maxerr1 < 1.05e-7
E               assert 1.1261111086504499e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp16-fp32-fc1-nf4-DQ_True-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float16, double_quant = True, kind = 'fc1'
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
>               assert maxerr1 < 1.05e-7
E               assert 1.125798300009158e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp16-fp32-fc1-nf4-DQ_False-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float16, double_quant = False, kind = 'fc1'
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
>               assert maxerr1 < 1.05e-7
E               assert 1.1279550332728015e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp16-fp32-fc1-fp4-DQ_True-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = True, kind = 'fc1'
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
>               assert maxerr1 < 1.05e-7
E               assert 1.1843264431561222e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp16-fp32-fc1-fp4-DQ_False-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = False, kind = 'fc1'
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
>               assert maxerr1 < 1.05e-7
E               assert 1.1704311540376867e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp16-fp32-fc2-nf4-DQ_True-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float16, double_quant = True, kind = 'fc2'
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
>               assert maxerr1 < 1.05e-7
E               assert 3.5766869317557444e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp16-fp32-fc2-nf4-DQ_False-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float16, double_quant = False, kind = 'fc2'
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
>               assert maxerr1 < 1.05e-7
E               assert 3.6562391083200575e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp16-fp32-fc2-fp4-DQ_True-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = True, kind = 'fc2'
        elif dtype == torch.float32:
            if dim <= 512:
>               assert err1 < 5e-8
E               assert 5.1333022272426384e-08 < 5e-08

tests\test_functional.py:1408: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp16-fp32-fc2-fp4-DQ_False-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = False, kind = 'fc2'
        elif dtype == torch.float32:
            if dim <= 512:
>               assert err1 < 5e-8
E               assert 5.1402038864314965e-08 < 5e-08

tests\test_functional.py:1408: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp16-fp32-attn_packed-nf4-DQ_True-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float16, double_quant = True, kind = 'attn_packed'
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
>               assert maxerr1 < 1.05e-7
E               assert 1.1263745264536429e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp16-fp32-attn_packed-nf4-DQ_False-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float16, double_quant = False, kind = 'attn_packed'
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
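For reference while triaging, the failing configuration can be exercised outside the test harness. The following is an editorial sketch, not part of the pytest output: it distills the test body shown above into a standalone script for one failing case (dim=512, fp32 compute, nf4, attn_packed, double quant, fp16 quant storage). The imports of bitsandbytes as bnb and bitsandbytes.functional as F are assumed to match the test module; shapes, API calls, and the 1.05e-7 bound are copied from the listing above.

# Editorial repro sketch (assumption: a CUDA build of bitsandbytes is installed).
import math

import torch

import bitsandbytes as bnb
import bitsandbytes.functional as F

device, dim, dtype = "cuda", 512, torch.float32

max_errs = []
for _ in range(100):
    A = torch.randn(1, dim, dtype=dtype, device=device)
    # attn_packed: packed QKV-style projection, weight shape (3*dim, dim)
    B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim)

    qB, state = F.quantize_4bit(
        B, quant_type="nf4", compress_statistics=True, quant_storage=torch.float16
    )
    C2 = F.gemv_4bit(A, qB.t(), state=state)  # fused inference kernel
    A.requires_grad = True
    C1 = bnb.matmul_4bit(A, qB.t(), state)    # autograd path
    max_errs.append((C1 - C2).abs().max().item())

maxerr1 = sum(max_errs) / len(max_errs) / math.sqrt(dim)
# The failure above for this configuration reports 1.1263745264536429e-07
# against the test's 1.05e-07 bound.
print(maxerr1)

The remaining failures repeat the same test source verbatim, so they are listed below in condensed form: failure header, parameters, failing assertion, and location.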
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp16-fp32-attn_packed-fp4-DQ_True-cuda] _

device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = True, kind = 'attn_packed'

>               assert maxerr1 < 1.05e-7
E               assert 1.1323014270254874e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp16-fp32-attn_packed-fp4-DQ_False-cuda] _

device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = False, kind = 'attn_packed'

>               assert maxerr1 < 1.05e-7
E               assert 1.1261111086504499e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-bf16-fp32-fc1-nf4-DQ_True-cuda] _

device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'fc1'

>               assert maxerr1 < 1.05e-7
E               assert 1.125798300009158e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-bf16-fp32-fc1-nf4-DQ_False-cuda] _

device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'fc1'

>               assert maxerr1 < 1.05e-7
E               assert 1.1279550332728015e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-bf16-fp32-fc1-fp4-DQ_True-cuda] _

device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'fc1'

>               assert maxerr1 < 1.05e-7
E               assert 1.1843264431561222e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-bf16-fp32-fc1-fp4-DQ_False-cuda] _

device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'fc1'

>               assert maxerr1 < 1.05e-7
E               assert 1.1704311540376867e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-bf16-fp32-fc2-nf4-DQ_True-cuda] _

device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'fc2'

>               assert maxerr1 < 1.05e-7
E               assert 3.5766869317557444e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-bf16-fp32-fc2-nf4-DQ_False-cuda] _

device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'fc2'

>               assert maxerr1 < 1.05e-7
E               assert 3.6562391083200575e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-bf16-fp32-fc2-fp4-DQ_True-cuda] _

device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'fc2'

>               assert err1 < 5e-8
E               assert 5.1333022272426384e-08 < 5e-08

tests\test_functional.py:1408: AssertionError
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: > assert err1 
< 5e-8 E assert 5.1402038864314965e-08 < 5e-08 tests\test_functional.py:1408: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-bf16-fp32-attn_packed-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.bfloat16, double_quant = True, kind = 'attn_packed' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) 
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-bf16-fp32-attn_packed-nf4-DQ_False-cuda] _
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'attn_packed'
tests\test_functional.py:1410: in test_gemv_4bit
    assert maxerr1 < 1.05e-7
E   assert 1.0624957091793192e-07 < 1.05e-07

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-bf16-fp32-attn_packed-fp4-DQ_True-cuda] _
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'attn_packed'
tests\test_functional.py:1410: in test_gemv_4bit
    assert maxerr1 < 1.05e-7
E   assert 1.1323014270254874e-07 < 1.05e-07

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-bf16-fp32-attn_packed-fp4-DQ_False-cuda] _
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'attn_packed'
tests\test_functional.py:1410: in test_gemv_4bit
    assert maxerr1 < 1.05e-7
E   assert 1.1261111086504499e-07 < 1.05e-07

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp32-fp32-fc1-nf4-DQ_True-cuda] _
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = True, kind = 'fc1'
tests\test_functional.py:1410: in test_gemv_4bit
    assert maxerr1 < 1.05e-7
E   assert 1.125798300009158e-07 < 1.05e-07
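For the attn_packed and fc1 cases quoted so far, the misses are narrow. A back-of-the-envelope check on three of the values above (plain Python arithmetic, values copied from the E lines, no GPU needed):

# Observed values and thresholds copied from the E lines above.
cases = [
    ("err1,    tests\\test_functional.py:1408", 5.1402038864314965e-08, 5e-8),
    ("maxerr1, tests\\test_functional.py:1410", 1.1263745264536429e-07, 1.05e-7),
    ("maxerr1, tests\\test_functional.py:1410", 1.125798300009158e-07, 1.05e-7),
]
for name, observed, bound in cases:
    print(f"{name}: {observed / bound - 1:+.1%} over the threshold")
# Prints roughly +2.8%, +7.3%, +7.2%: small margins, not order-of-magnitude breaks.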
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp32-fp32-fc1-nf4-DQ_False-cuda] _
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = False, kind = 'fc1'
tests\test_functional.py:1410: in test_gemv_4bit
    assert maxerr1 < 1.05e-7
E   assert 1.1279550332728015e-07 < 1.05e-07

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp32-fp32-fc1-fp4-DQ_True-cuda] _
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = True, kind = 'fc1'
tests\test_functional.py:1410: in test_gemv_4bit
    assert maxerr1 < 1.05e-7
E   assert 1.1843264431561222e-07 < 1.05e-07

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp32-fp32-fc1-fp4-DQ_False-cuda] _
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = False, kind = 'fc1'
tests\test_functional.py:1410: in test_gemv_4bit
    assert maxerr1 < 1.05e-7
E   assert 1.1704311540376867e-07 < 1.05e-07

_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp32-fp32-fc2-nf4-DQ_True-cuda] _
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = True, kind = 'fc2'
tests\test_functional.py:1410: in test_gemv_4bit
    assert maxerr1 < 1.05e-7
E   assert 3.5766869317557444e-07 < 1.05e-07
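To iterate on a single combination rather than rerunning the whole parameter matrix, the node id from any failure header can be handed back to pytest unchanged. A sketch using `pytest.main` (the same id also works as a command-line argument; forward slashes are used here so the Windows backslash is not treated as an escape inside the Python string):

import pytest

# Node id copied from the fc2-nf4-DQ_True failure above.
pytest.main([
    "tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit"
    "[dim=512-fp32-fp32-fc2-nf4-DQ_True-cuda]",
])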
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 > assert maxerr1 < 1.05e-7 E assert 3.6562391083200575e-07 < 1.05e-07 tests\test_functional.py:1410: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp32-fp32-fc2-fp4-DQ_True-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.float32, double_quant = True, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / 
len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: > assert err1 < 5e-8 E assert 5.1333022272426384e-08 < 5e-08 tests\test_functional.py:1408: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp32-fp32-fc2-fp4-DQ_False-cuda] _ self = device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.float32, double_quant = False, kind = 'fc2' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
    iters = 10 if device == "cpu" else 100

    for i in range(iters):
        if kind == "fc1":
            A = torch.randn(1, dim, dtype=dtype, device=device)
            B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim)
        elif kind == "fc2":
            A = torch.randn(1, 4 * dim, dtype=dtype, device=device)
            B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim)
        elif kind == "attn":
            A = torch.randn(1, dim, dtype=dtype, device=device)
            B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim)
        elif kind == "attn_packed":
            A = torch.randn(1, dim, dtype=dtype, device=device)
            B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim)

        qB, state = F.quantize_4bit(
            B,
            quant_type=storage_type,
            compress_statistics=double_quant,
            quant_storage=quant_storage,
        )
        C3 = torch.matmul(A, B.t())

        # CPU requires convert weight packed for gemv
        if device == "cpu" and F.has_avx512bf16():
            qB, state = F._convert_weight_packed_for_cpu(qB, state)
            qB = qB.t()

        C2 = F.gemv_4bit(A, qB.t(), state=state)
        A.requires_grad = True
        C1 = bnb.matmul_4bit(A, qB.t(), state)

        err1 = (C1 - C2).abs().float()
        err2 = (C3 - C2).abs().float()
        err3 = (C3 - C1).abs().float()

        mag1 = torch.abs(C1).float() + 1e-5
        mag2 = torch.abs(C3).float() + 1e-5
        mag3 = torch.abs(C3).float() + 1e-5

        relerr1 = err1 / mag1
        relerr2 = err2 / mag2
        relerr3 = err3 / mag3

        max_err1 = err1.max()
        max_err2 = err2.max()
        max_err3 = err3.max()

        errs1.append(err1.mean().item())
        errs2.append(err2.mean().item())
        errs3.append(err3.mean().item())
        relerrs1.append(relerr1.mean().item())
        relerrs2.append(relerr2.mean().item())
        relerrs3.append(relerr3.mean().item())
        max_errs1.append(max_err1.item())
        max_errs2.append(max_err2.item())
        max_errs3.append(max_err3.item())

        c = int(C1.numel() * 0.0014 * (dim / 256)) + 1
        c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False)

    err1 = sum(errs1) / len(errs1) / math.sqrt(dim)
    err2 = sum(errs2) / len(errs2) / math.sqrt(dim)
    err3 = sum(errs3) / len(errs3) / math.sqrt(dim)
    relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim)
    relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim)
    relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim)
    maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim)
    maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim)
    maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim)
    absratio = err2 / err3
    relratio = relerr2 / relerr3
    maxratio = relerr2 / relerr3

    # for debugging if the tests fails
    #
    # print('='*80)
    # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:')
    # print(C1.flatten()[-20:])
    # print(C2.flatten()[-20:])
    # print(f'inference vs training abs: {err1}')
    # print(f'inference vs training rel: {relerr1}')
    # print(f'inference vs training max: {maxerr1}')
    # print(f'inference vs training vs torch err ratio abs: {absratio}')
    # print(f'inference vs training vs torch err ratio rel: {relratio}')
    # print(f'inference vs training vs torch err ratio max: {maxratio}')

    if dtype == torch.float16:
        if dim <= 512:
            assert err1 < 7e-5
            # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727
            if (
                device == "cuda"
                and double_quant
                and storage_type == "fp4"
                and kind == "fc2"
                and torch.cuda.get_device_capability() == (7, 5)
            ):
                assert relerr1 < 0.00093
            else:
                assert relerr1 < 0.0008
        else:
            assert err1 < 6e-5
            assert relerr1 < 2e-4
        assert absratio < 1.005 and absratio > 0.995
        assert relratio < 1.005 and relratio > 0.992
        assert maxratio < 1.005 and maxratio > 0.992
    elif dtype == torch.float32:
        if dim <= 512:
>           assert err1 < 5e-8
E           assert 5.1402038864314965e-08 < 5e-08

tests\test_functional.py:1408: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp32-fp32-attn_packed-nf4-DQ_True-cuda] _

self =
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = True, kind = 'attn_packed'

    elif dtype == torch.float32:
        if dim <= 512:
            assert err1 < 5e-8
            assert relerr1 < 1e-6
>           assert maxerr1 < 1.05e-7
E           assert 1.1263745264536429e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
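All of these failures share the error accounting visible in the source above: C1 (bnb.matmul_4bit, the training path) and C2 (F.gemv_4bit, the inference kernel) are each compared against the fp32 torch.matmul reference C3; absolute errors are turned into relative errors with a 1e-5 magnitude floor, and the per-iteration means and maxima are averaged over the iterations and then divided by sqrt(dim). A minimal self-contained sketch of that metric for a single pair of outputs (hypothetical helper name, plain PyTorch, no bitsandbytes dependency):

import math
import torch

def gemv_error_metrics(x: torch.Tensor, y: torch.Tensor, dim: int):
    """Hypothetical helper mirroring test_gemv_4bit's error accounting for one pair.

    Returns (mean abs error, mean relative error, max abs error), each
    normalized by sqrt(dim) as in the test. The relative error divides by
    |x| floored at 1e-5, matching relerr1 = err1 / (|C1| + 1e-5) above.
    """
    err = (x - y).abs().float()
    rel = err / (x.abs().float() + 1e-5)  # 1e-5 floor keeps rel finite near zero
    scale = math.sqrt(dim)
    return err.mean().item() / scale, rel.mean().item() / scale, err.max().item() / scale

# e.g. err1/relerr1/maxerr1 for one iteration:
# e1, r1, m1 = gemv_error_metrics(C1, C2, dim)

For a single iteration this matches the test's err*/relerr*/maxerr* values; the test itself accumulates the per-iteration values over 100 iterations (10 on CPU) and applies the sqrt(dim) scaling to the averages at the end.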
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp32-fp32-attn_packed-nf4-DQ_False-cuda] _

self =
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = False, kind = 'attn_packed'

    elif dtype == torch.float32:
        if dim <= 512:
            assert err1 < 5e-8
            assert relerr1 < 1e-6
>           assert maxerr1 < 1.05e-7
E           assert 1.0624957091793192e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
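The failing configurations can be reproduced outside pytest with exactly the calls captured in the test body. A sketch for the dim=512-fp32-fp32-attn_packed-nf4-DQ_True case above (assumes a CUDA build of bitsandbytes; the aliases bnb and F match the ones used in the test source):

import math
import torch
import bitsandbytes as bnb
import bitsandbytes.functional as F

dim = 512
A = torch.randn(1, dim, dtype=torch.float32, device="cuda")
# attn_packed weight shape, scaled as in the test
B = torch.randn(dim * 3, dim, dtype=torch.float32, device="cuda") / math.sqrt(dim)

qB, state = F.quantize_4bit(B, quant_type="nf4", compress_statistics=True, quant_storage=torch.float32)
C3 = torch.matmul(A, B.t())               # fp32 reference
C2 = F.gemv_4bit(A, qB.t(), state=state)  # inference kernel
C1 = bnb.matmul_4bit(A, qB.t(), state)    # training path

print((C1 - C3).abs().max().item(), (C2 - C3).abs().max().item())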
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp32-fp32-attn_packed-fp4-DQ_True-cuda] _

self =
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = True, kind = 'attn_packed'

    elif dtype == torch.float32:
        if dim <= 512:
            assert err1 < 5e-8
            assert relerr1 < 1e-6
>           assert maxerr1 < 1.05e-7
E           assert 1.1323014270254874e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=512-fp32-fp32-attn_packed-fp4-DQ_False-cuda] _

self =
device = 'cuda', dim = 512, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = False, kind = 'attn_packed'

    elif dtype == torch.float32:
        if dim <= 512:
            assert err1 < 5e-8
            assert relerr1 < 1e-6
>           assert maxerr1 < 1.05e-7
E           assert 1.1261111086504499e-07 < 1.05e-07

tests\test_functional.py:1410: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-fc1-nf4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.uint8, double_quant = True, kind = 'fc1'

        else:
            assert err1 < 5e-8
            assert relerr1 < 8e-6
>           assert maxerr1 < 1e-7
E           assert 1.2226402759552002e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
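A single failing parametrization can be re-run by passing the bracketed node id from the failure header straight to pytest. A sketch using pytest.main, with the node id copied from the report above (on a shell the id would need quoting because of the brackets):

import pytest

# Re-run one failing parametrization by its node id.
failing = (
    r"tests\test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit"
    "[dim=1024-uint8-fp32-fc1-nf4-DQ_True-cuda]"
)
pytest.main([failing, "-v"])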
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-fc1-nf4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.uint8, double_quant = False, kind = 'fc1'

        else:
            assert err1 < 5e-8
            assert relerr1 < 8e-6
>           assert maxerr1 < 1e-7
E           assert 1.2289732694625855e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
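Inside the loop the test also calls assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False), a suite helper that, judging by its arguments, tolerates up to count mismatched elements and only raises when throw is set. A hedged reimplementation of that idea (hypothetical names; the real helper lives in the test suite's utilities):

import torch

def count_not_close(a: torch.Tensor, b: torch.Tensor, rtol: float, atol: float) -> int:
    """Return how many elements of a and b are NOT within rtol/atol of each other."""
    return int((~torch.isclose(a, b, rtol=rtol, atol=atol)).sum().item())

def all_approx_close(a, b, rtol=1e-5, atol=0.01, count=0, throw=True) -> int:
    """Pass if at most `count` elements differ; mirrors the helper's apparent contract."""
    n_bad = count_not_close(a, b, rtol, atol)
    if throw:
        assert n_bad <= count, f"{n_bad} elements exceed tolerance (allowed: {count})"
    return n_bad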
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-fc1-fp4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.uint8, double_quant = True, kind = 'fc1'

        else:
            assert err1 < 5e-8
            assert relerr1 < 8e-6
>           assert maxerr1 < 1e-7
E           assert 1.2608245015144348e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-fc1-fp4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.uint8, double_quant = False, kind = 'fc1'

        else:
            assert err1 < 5e-8
            assert relerr1 < 8e-6
>           assert maxerr1 < 1e-7
E           assert 1.2908130884170532e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-fc2-nf4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.uint8, double_quant = True, kind = 'fc2'

        else:
            assert err1 < 5e-8
            assert relerr1 < 8e-6
>           assert maxerr1 < 1e-7
E           assert 4.0937215089797974e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
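Every failure in this run trips the fp32 branch of the final threshold tree: for dim <= 512 the test requires err1 < 5e-8, relerr1 < 1e-6 and maxerr1 < 1.05e-7; for dim = 1024 it relaxes relerr1 to 8e-6 but tightens maxerr1 to 1e-7, and the fc2 cases above exceed that bound by roughly 4x. (Note that the captured source computes maxratio = relerr2 / relerr3, the same expression as relratio; a genuine max-error ratio would presumably be maxerr2 / maxerr3, so the maxratio assert in the fp16 branch duplicates the relratio check.) A small sketch of the fp32 bounds as a standalone function, thresholds copied from the asserts:

def fp32_bounds(dim: int) -> dict:
    """Thresholds applied by test_gemv_4bit for torch.float32 (copied from the asserts)."""
    if dim <= 512:
        return {"err1": 5e-8, "relerr1": 1e-6, "maxerr1": 1.05e-7}
    return {"err1": 5e-8, "relerr1": 8e-6, "maxerr1": 1e-7}

def fp32_pass(dim: int, err1: float, relerr1: float, maxerr1: float) -> bool:
    b = fp32_bounds(dim)
    return err1 < b["err1"] and relerr1 < b["relerr1"] and maxerr1 < b["maxerr1"]

# e.g. the dim=1024-fc2-nf4-DQ_True failure above: maxerr1 ~= 4.09e-7 > 1e-7
assert not fp32_pass(1024, 4.9e-8, 7e-6, 4.0937215089797974e-07)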
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-fc2-nf4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.uint8, double_quant = False, kind = 'fc2'

        else:
            assert err1 < 5e-8
            assert relerr1 < 8e-6
>           assert maxerr1 < 1e-7
E           assert 4.094839096069336e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-fc2-fp4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.uint8, double_quant = True, kind = 'fc2'

    elif dtype == torch.float32:
        if dim <= 512:
            assert err1 < 5e-8
            assert relerr1 < 1e-6
            assert maxerr1 < 1.05e-7
        else:
>           assert err1 < 5e-8
E           assert 5.1653077939306514e-08 < 5e-08

tests\test_functional.py:1412: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-fc2-fp4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.uint8, double_quant = False, kind = 'fc2'

@pytest.mark.skipif(
    ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64"
)
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
@pytest.mark.parametrize("storage_type", ["nf4", "fp4"])
@pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize(
    "quant_storage",
    [torch.uint8, torch.float16, torch.bfloat16, torch.float32],
    ids=describe_dtype,
)
@pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
    if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
        pytest.skip("This configuration is not supported on HPU.")

    errs1 = []
    errs2 = []
    errs3 = []
    relerrs1 = []
    relerrs2 = []
    relerrs3 = []
    max_errs1 = []
    max_errs2 = []
    max_errs3 = []

    # Large number of iterations is excessive and slow on CPU.
    # Keep for CUDA/XPU for now.
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: > assert err1 < 5e-8 E assert 5.175197777873564e-08 < 5e-08 tests\test_functional.py:1412: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-attn-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.uint8, double_quant = True, kind = 'attn' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) 
relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.0050833225250244e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-attn-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.uint8, double_quant = False, kind = 'attn' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.0270625352859497e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-attn-fp4-DQ_True-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.uint8, double_quant = True, kind = 'attn' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 
= sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.0348856449127198e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-attn-fp4-DQ_False-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.uint8, double_quant = False, kind = 'attn' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.0695308446884156e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-attn_packed-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.uint8, double_quant = True, kind = 'attn_packed' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / 
math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.1809170246124267e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-attn_packed-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.uint8, double_quant = False, kind = 'attn_packed' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.1586584150791168e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-attn_packed-fp4-DQ_True-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.uint8, double_quant = True, kind = 'attn_packed' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / 
math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.2051314115524293e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-uint8-fp32-attn_packed-fp4-DQ_False-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.uint8, double_quant = False, kind = 'attn_packed' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.276843249797821e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-fc1-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.float16, double_quant = True, kind = 'fc1' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = 
sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.2226402759552002e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-fc1-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.float16, double_quant = False, kind = 'fc1' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.2289732694625855e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-fc1-fp4-DQ_True-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = True, kind = 'fc1'

    @pytest.mark.skipif(
        ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64"
    )
    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
    @pytest.mark.parametrize("storage_type", ["nf4", "fp4"])
    @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
    @pytest.mark.parametrize(
        "quant_storage",
        [torch.uint8, torch.float16, torch.bfloat16, torch.float32],
        ids=describe_dtype,
    )
    @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
    def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
        if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
            pytest.skip("This configuration is not supported on HPU.")

        errs1 = []
        errs2 = []
        errs3 = []
        relerrs1 = []
        relerrs2 = []
        relerrs3 = []
        max_errs1 = []
        max_errs2 = []
        max_errs3 = []

        # Large number of iterations is excessive and slow on CPU.
        # Keep for CUDA/XPU for now.
        iters = 10 if device == "cpu" else 100

        for i in range(iters):
            if kind == "fc1":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "fc2":
                A = torch.randn(1, 4 * dim, dtype=dtype, device=device)
                B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "attn":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "attn_packed":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim)

            qB, state = F.quantize_4bit(
                B,
                quant_type=storage_type,
                compress_statistics=double_quant,
                quant_storage=quant_storage,
            )
            C3 = torch.matmul(A, B.t())

            # CPU requires convert weight packed for gemv
            if device == "cpu" and F.has_avx512bf16():
                qB, state = F._convert_weight_packed_for_cpu(qB, state)
                qB = qB.t()

            C2 = F.gemv_4bit(A, qB.t(), state=state)
            A.requires_grad = True
            C1 = bnb.matmul_4bit(A, qB.t(), state)

            err1 = (C1 - C2).abs().float()
            err2 = (C3 - C2).abs().float()
            err3 = (C3 - C1).abs().float()

            mag1 = torch.abs(C1).float() + 1e-5
            mag2 = torch.abs(C3).float() + 1e-5
            mag3 = torch.abs(C3).float() + 1e-5

            relerr1 = err1 / mag1
            relerr2 = err2 / mag2
            relerr3 = err3 / mag3

            max_err1 = err1.max()
            max_err2 = err2.max()
            max_err3 = err3.max()

            errs1.append(err1.mean().item())
            errs2.append(err2.mean().item())
            errs3.append(err3.mean().item())

            relerrs1.append(relerr1.mean().item())
            relerrs2.append(relerr2.mean().item())
            relerrs3.append(relerr3.mean().item())

            max_errs1.append(max_err1.item())
            max_errs2.append(max_err2.item())
            max_errs3.append(max_err3.item())

            c = int(C1.numel() * 0.0014 * (dim / 256)) + 1
            c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False)

        err1 = sum(errs1) / len(errs1) / math.sqrt(dim)
        err2 = sum(errs2) / len(errs2) / math.sqrt(dim)
        err3 = sum(errs3) / len(errs3) / math.sqrt(dim)
        relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim)
        relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim)
        relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim)
        maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim)
        maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim)
        maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim)
        absratio = err2 / err3
        relratio = relerr2 / relerr3
        maxratio = relerr2 / relerr3

        # for debugging if the tests fails
        #
        # print('='*80)
        # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:')
        # print(C1.flatten()[-20:])
        # print(C2.flatten()[-20:])
        # print(f'inference vs training abs: {err1}')
        # print(f'inference vs training rel: {relerr1}')
        # print(f'inference vs training max: {maxerr1}')
        # print(f'inference vs training vs torch err ratio abs: {absratio}')
        # print(f'inference vs training vs torch err ratio rel: {relratio}')
        # print(f'inference vs training vs torch err ratio max: {maxratio}')

        if dtype == torch.float16:
            if dim <= 512:
                assert err1 < 7e-5
                # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727
                if (
                    device == "cuda"
                    and double_quant
                    and storage_type == "fp4"
                    and kind == "fc2"
                    and torch.cuda.get_device_capability() == (7, 5)
                ):
                    assert relerr1 < 0.00093
                else:
                    assert relerr1 < 0.0008
            else:
                assert err1 < 6e-5
                assert relerr1 < 2e-4
            assert absratio < 1.005 and absratio > 0.995
            assert relratio < 1.005 and relratio > 0.992
            assert maxratio < 1.005 and maxratio > 0.992
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.2608245015144348e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
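All of the failures in this run of the log share dim=1024, dtype=torch.float32, and quant_storage=torch.float16; only kind, storage_type, and double_quant vary. A single case can be rerun in isolation by its node ID, and the asserted quantity can be reproduced with a short standalone script. The sketch below is a minimal, editor-added repro that assumes only the bitsandbytes calls the test itself makes (F.quantize_4bit, F.gemv_4bit, bnb.matmul_4bit); note the test averages the per-iteration maxima over 100 iterations before dividing by sqrt(dim), so a single-shot value is noisier than the asserted maxerr1.

# Rerun one failing case by its node ID (copied from the header above):
#   pytest "tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=1024-fp16-fp32-fc1-fp4-DQ_True-cuda]"
#
# Minimal sketch of the measured quantity, under the same fc1/fp4/DQ settings.
import math
import torch
import bitsandbytes as bnb
import bitsandbytes.functional as F

dim = 1024
A = torch.randn(1, dim, dtype=torch.float32, device="cuda")
B = torch.randn(dim * 4, dim, dtype=torch.float32, device="cuda") / math.sqrt(dim)  # "fc1" shapes

qB, state = F.quantize_4bit(B, quant_type="fp4", compress_statistics=True, quant_storage=torch.float16)
C1 = bnb.matmul_4bit(A, qB.t(), state)    # training path
C2 = F.gemv_4bit(A, qB.t(), state=state)  # inference gemv kernel

# The test's maxerr1 is the 100-iteration mean of this value, divided by sqrt(dim).
print((C1 - C2).abs().max().item() / math.sqrt(dim))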
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-fc1-fp4-DQ_False-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = False, kind = 'fc1'

    ...
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.2908130884170532e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-fc2-nf4-DQ_True-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float16, double_quant = True, kind = 'fc2'

    ...
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 4.0937215089797974e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-fc2-nf4-DQ_False-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float16, double_quant = False, kind = 'fc2'

    ...
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 4.094839096069336e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-fc2-fp4-DQ_True-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = True, kind = 'fc2'

    ...
            else:
>               assert err1 < 5e-8
E               assert 5.1653077939306514e-08 < 5e-08

tests\test_functional.py:1412: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-fc2-fp4-DQ_False-cuda] _

self = <tests.test_functional.TestQuantize4BitFunctional object at 0x...>
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = False, kind = 'fc2'

    ...
            else:
>               assert err1 < 5e-8
E               assert 5.175197777873564e-08 < 5e-08

tests\test_functional.py:1412: AssertionError
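Unlike the surrounding failures, the two fc2-fp4 cases above trip one assert earlier, on the mean-error bound (err1, tests\test_functional.py:1412) rather than the max-error bound (maxerr1, line 1414), and only by about 3% over the 5e-8 threshold. For reference, here is a small editor-added sketch of how the test collapses its per-iteration statistics into the asserted scalars (names mirror the test body quoted above):

import math

def aggregate(per_iter_values, dim):
    # Mean over iterations, normalized by sqrt(dim) -- this is how the test
    # turns errs1 (per-iteration mean |C1 - C2|) into err1, and max_errs1
    # (per-iteration max |C1 - C2|) into maxerr1.
    return sum(per_iter_values) / len(per_iter_values) / math.sqrt(dim)

# fc2-fp4-DQ_True: aggregate(errs1, 1024) ~= 5.1653e-08, just over the 5e-08
# bound, so the assert at line 1412 raises before maxerr1 is ever checked.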
relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.0050833225250244e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-attn-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.float16, double_quant = False, kind = 'attn' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.0270625352859497e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-attn-fp4-DQ_True-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.float16, double_quant = True, kind = 'attn' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) 
err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.0348856449127198e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-attn-fp4-DQ_False-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4' quant_storage = torch.float16, double_quant = False, kind = 'attn' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 
5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.0695308446884156e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-attn_packed-nf4-DQ_True-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.float16, double_quant = True, kind = 'attn_packed' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. iters = 10 if device == "cpu" else 100 for i in range(iters): if kind == "fc1": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "fc2": A = torch.randn(1, 4 * dim, dtype=dtype, device=device) B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim) elif kind == "attn_packed": A = torch.randn(1, dim, dtype=dtype, device=device) B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim) qB, state = F.quantize_4bit( B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage, ) C3 = torch.matmul(A, B.t()) # CPU requires convert weight packed for gemv if device == "cpu" and F.has_avx512bf16(): qB, state = F._convert_weight_packed_for_cpu(qB, state) qB = qB.t() C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) err1 = (C1 - C2).abs().float() err2 = (C3 - C2).abs().float() err3 = (C3 - C1).abs().float() mag1 = torch.abs(C1).float() + 1e-5 mag2 = torch.abs(C3).float() + 1e-5 mag3 = torch.abs(C3).float() + 1e-5 relerr1 = err1 / mag1 relerr2 = err2 / mag2 relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() max_err3 = err3.max() errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) errs3.append(err3.mean().item()) relerrs1.append(relerr1.mean().item()) relerrs2.append(relerr2.mean().item()) relerrs3.append(relerr3.mean().item()) max_errs1.append(max_err1.item()) max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False) err1 = sum(errs1) / len(errs1) / math.sqrt(dim) err2 = sum(errs2) / len(errs2) / 
math.sqrt(dim) err3 = sum(errs3) / len(errs3) / math.sqrt(dim) relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.1809170246124267e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-attn_packed-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.float16, double_quant = False, kind = 'attn_packed' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
        ...
>               assert maxerr1 < 1e-7
E               assert 1.1586584150791168e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-attn_packed-fp4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = True, kind = 'attn_packed'

    @pytest.mark.skipif(
        ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64"
    )
    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
    @pytest.mark.parametrize("storage_type", ["nf4", "fp4"])
    @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
    @pytest.mark.parametrize(
        "quant_storage",
        [torch.uint8, torch.float16, torch.bfloat16, torch.float32],
        ids=describe_dtype,
    )
    @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
    def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
        if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
            pytest.skip("This configuration is not supported on HPU.")

        errs1 = []
        errs2 = []
        errs3 = []
        relerrs1 = []
        relerrs2 = []
        relerrs3 = []
        max_errs1 = []
        max_errs2 = []
        max_errs3 = []

        # Large number of iterations is excessive and slow on CPU.
        # Keep for CUDA/XPU for now.
        iters = 10 if device == "cpu" else 100

        for i in range(iters):
            if kind == "fc1":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "fc2":
                A = torch.randn(1, 4 * dim, dtype=dtype, device=device)
                B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "attn":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "attn_packed":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim)

            qB, state = F.quantize_4bit(
                B,
                quant_type=storage_type,
                compress_statistics=double_quant,
                quant_storage=quant_storage,
            )
            C3 = torch.matmul(A, B.t())

            # CPU requires convert weight packed for gemv
            if device == "cpu" and F.has_avx512bf16():
                qB, state = F._convert_weight_packed_for_cpu(qB, state)
                qB = qB.t()

            C2 = F.gemv_4bit(A, qB.t(), state=state)
            A.requires_grad = True
            C1 = bnb.matmul_4bit(A, qB.t(), state)

            err1 = (C1 - C2).abs().float()
            err2 = (C3 - C2).abs().float()
            err3 = (C3 - C1).abs().float()
            mag1 = torch.abs(C1).float() + 1e-5
            mag2 = torch.abs(C3).float() + 1e-5
            mag3 = torch.abs(C3).float() + 1e-5
            relerr1 = err1 / mag1
            relerr2 = err2 / mag2
            relerr3 = err3 / mag3
            max_err1 = err1.max()
            max_err2 = err2.max()
            max_err3 = err3.max()

            errs1.append(err1.mean().item())
            errs2.append(err2.mean().item())
            errs3.append(err3.mean().item())
            relerrs1.append(relerr1.mean().item())
            relerrs2.append(relerr2.mean().item())
            relerrs3.append(relerr3.mean().item())
            max_errs1.append(max_err1.item())
            max_errs2.append(max_err2.item())
            max_errs3.append(max_err3.item())

            c = int(C1.numel() * 0.0014 * (dim / 256)) + 1
            c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False)

        err1 = sum(errs1) / len(errs1) / math.sqrt(dim)
        err2 = sum(errs2) / len(errs2) / math.sqrt(dim)
        err3 = sum(errs3) / len(errs3) / math.sqrt(dim)
        relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim)
        relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim)
        relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim)
        maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim)
        maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim)
        maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim)
        absratio = err2 / err3
        relratio = relerr2 / relerr3
        maxratio = relerr2 / relerr3

        # for debugging if the tests fails
        #
        # print('='*80)
        # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:')
        # print(C1.flatten()[-20:])
        # print(C2.flatten()[-20:])
        # print(f'inference vs training abs: {err1}')
        # print(f'inference vs training rel: {relerr1}')
        # print(f'inference vs training max: {maxerr1}')
        # print(f'inference vs training vs torch err ratio abs: {absratio}')
        # print(f'inference vs training vs torch err ratio rel: {relratio}')
        # print(f'inference vs training vs torch err ratio max: {maxratio}')

        if dtype == torch.float16:
            if dim <= 512:
                assert err1 < 7e-5
                # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727
                if (
                    device == "cuda"
                    and double_quant
                    and storage_type == "fp4"
                    and kind == "fc2"
                    and torch.cuda.get_device_capability() == (7, 5)
                ):
                    assert relerr1 < 0.00093
                else:
                    assert relerr1 < 0.0008
            else:
                assert err1 < 6e-5
                assert relerr1 < 2e-4
            assert absratio < 1.005 and absratio > 0.995
            assert relratio < 1.005 and relratio > 0.992
            assert maxratio < 1.005 and maxratio > 0.992
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.2051314115524293e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
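For local triage, the failing case can be reproduced outside pytest. The sketch below is ours, not part of the test suite: it assumes a CUDA build of bitsandbytes is importable and mirrors the maxerr1 computation from test_gemv_4bit for the dim=1024/fp32/fp4/attn_packed parametrization shown above.

    import math

    import torch

    import bitsandbytes as bnb
    import bitsandbytes.functional as F

    dim, device, dtype = 1024, "cuda", torch.float32

    max_errs = []
    for _ in range(100):
        A = torch.randn(1, dim, dtype=dtype, device=device)
        # attn_packed weight shape, scaled as in the test
        B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim)
        qB, state = F.quantize_4bit(
            B, quant_type="fp4", compress_statistics=True, quant_storage=torch.float16
        )
        C1 = bnb.matmul_4bit(A, qB.t(), state)    # autograd path under test
        C2 = F.gemv_4bit(A, qB.t(), state=state)  # fused gemv path under test
        max_errs.append((C1 - C2).abs().float().max().item())

    # Same normalization as the failing assertion: mean of the per-iteration
    # max errors, divided by sqrt(dim); the test expects this below 1e-7.
    maxerr1 = sum(max_errs) / len(max_errs) / math.sqrt(dim)
    print(f"maxerr1 = {maxerr1:.4e}")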
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp16-fp32-attn_packed-fp4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float16, double_quant = False, kind = 'attn_packed'

    ...
>               assert maxerr1 < 1e-7
E               assert 1.276843249797821e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-fc1-nf4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'fc1'

    ...
>               assert maxerr1 < 1e-7
E               assert 1.2226402759552002e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-fc1-nf4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'fc1'

    ...
>               assert maxerr1 < 1e-7
E               assert 1.2289732694625855e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-fc1-fp4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'fc1'

    ...
>               assert maxerr1 < 1e-7
E               assert 1.2608245015144348e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-fc1-fp4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'fc1'

    ...
>               assert maxerr1 < 1e-7
E               assert 1.2908130884170532e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-fc2-nf4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'fc2'

    ...
>               assert maxerr1 < 1e-7
E               assert 4.0937215089797974e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-fc2-nf4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'fc2'

    ...
>               assert maxerr1 < 1e-7
E               assert 4.094839096069336e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
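Two things stand out when reading these failures together. First, maxratio in the quoted source is computed as relerr2 / relerr3, i.e. the same quantity as relratio, so the averaged max-error ratio maxerr2 / maxerr3 is never actually checked. Second, most maxerr1 exceedances are marginal, while the fc2/nf4 cases overshoot by roughly 4x (and the fc2/fp4 cases below miss the mean-error bound err1 < 5e-8 by under 4%). A quick check, with the observed values transcribed from the E lines in this log:

    # Observed maxerr1 values copied from the E lines above, compared with the
    # 1e-7 threshold asserted at tests/test_functional.py:1414.
    observed = {
        "attn_packed-fp4-DQ_True": 1.2051314115524293e-07,
        "attn_packed-fp4-DQ_False": 1.276843249797821e-07,
        "fc1-nf4-DQ_True": 1.2226402759552002e-07,
        "fc1-fp4-DQ_False": 1.2908130884170532e-07,
        "fc2-nf4-DQ_True": 4.0937215089797974e-07,
        "fc2-nf4-DQ_False": 4.094839096069336e-07,
    }
    for name, maxerr1 in observed.items():
        print(f"{name}: {maxerr1 / 1e-7:.2f}x the threshold")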
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-fc2-fp4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'fc2'

    ...
>               assert err1 < 5e-8
E               assert 5.1653077939306514e-08 < 5e-08

tests\test_functional.py:1412: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-fc2-fp4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'fc2'

    ...
>               assert err1 < 5e-8
E               assert 5.175197777873564e-08 < 5e-08

tests\test_functional.py:1412: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-attn-nf4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'attn'

    ...
>               assert maxerr1 < 1e-7
E               assert 1.0050833225250244e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
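To rerun just this failing cluster without the rest of the matrix, keyword selection over the parametrized ids shown above should work (untested here; pytest's -k matches substrings of the test ids):

    import pytest

    # Equivalent CLI: pytest tests/test_functional.py -k "test_gemv_4bit and 1024 and fp32 and cuda"
    pytest.main(["tests/test_functional.py", "-k", "test_gemv_4bit and 1024 and fp32 and cuda"])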
relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) absratio = err2 / err3 relratio = relerr2 / relerr3 maxratio = relerr2 / relerr3 # for debugging if the tests fails # # print('='*80) # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') # print(C1.flatten()[-20:]) # print(C2.flatten()[-20:]) # print(f'inference vs training abs: {err1}') # print(f'inference vs training rel: {relerr1}') # print(f'inference vs training max: {maxerr1}') # print(f'inference vs training vs torch err ratio abs: {absratio}') # print(f'inference vs training vs torch err ratio rel: {relratio}') # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727 if ( device == "cuda" and double_quant and storage_type == "fp4" and kind == "fc2" and torch.cuda.get_device_capability() == (7, 5) ): assert relerr1 < 0.00093 else: assert relerr1 < 0.0008 else: assert err1 < 6e-5 assert relerr1 < 2e-4 assert absratio < 1.005 and absratio > 0.995 assert relratio < 1.005 and relratio > 0.992 assert maxratio < 1.005 and maxratio > 0.992 elif dtype == torch.float32: if dim <= 512: assert err1 < 5e-8 assert relerr1 < 1e-6 assert maxerr1 < 1.05e-7 else: assert err1 < 5e-8 assert relerr1 < 8e-6 > assert maxerr1 < 1e-7 E assert 1.0050833225250244e-07 < 1e-07 tests\test_functional.py:1414: AssertionError _ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-attn-nf4-DQ_False-cuda] _ self = device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4' quant_storage = torch.bfloat16, double_quant = False, kind = 'attn' @pytest.mark.skipif( ROCM_WARP_SIZE_64, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize( "quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype, ) @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim")) def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind): if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage): pytest.skip("This configuration is not supported on HPU.") errs1 = [] errs2 = [] errs3 = [] relerrs1 = [] relerrs2 = [] relerrs3 = [] max_errs1 = [] max_errs2 = [] max_errs3 = [] # Large number of iterations is excessive and slow on CPU. # Keep for CUDA/XPU for now. 
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-attn-nf4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'attn'

    [... test source identical to the first test_gemv_4bit traceback above ...]
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.0270625352859497e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-attn-fp4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'attn'

    [... test source identical to the first test_gemv_4bit traceback above ...]
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.0348856449127198e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-attn-fp4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'attn'

    [... test source identical to the first test_gemv_4bit traceback above ...]
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.0695308446884156e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-attn_packed-nf4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'attn_packed'

    [... test source identical to the first test_gemv_4bit traceback above ...]
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.1809170246124267e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-attn_packed-nf4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'attn_packed'

    [... test source identical to the first test_gemv_4bit traceback above ...]
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.1586584150791168e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-attn_packed-fp4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = True, kind = 'attn_packed'

    [... test source identical to the first test_gemv_4bit traceback above ...]
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.2051314115524293e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-bf16-fp32-attn_packed-fp4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.bfloat16, double_quant = False, kind = 'attn_packed'

    [... test source identical to the first test_gemv_4bit traceback above ...]
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.276843249797821e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-fc1-nf4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = True, kind = 'fc1'

    [... test source identical to the first test_gemv_4bit traceback above ...]
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.2226402759552002e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-fc1-nf4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = False, kind = 'fc1'

    [... test source identical to the first test_gemv_4bit traceback above ...]
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.2289732694625855e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-fc1-fp4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = True, kind = 'fc1'

    [... test source identical to the first test_gemv_4bit traceback above ...]
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.2608245015144348e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-fc1-fp4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = False, kind = 'fc1'

    [... test source identical to the first test_gemv_4bit traceback above ...]
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.2908130884170532e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 1.2908130884170532e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-fc2-nf4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = True, kind = 'fc2'

    @pytest.mark.skipif(
        ROCM_WARP_SIZE_64,
        reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64",
    )
    @pytest.mark.parametrize("device", get_available_devices())
    @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
    @pytest.mark.parametrize("storage_type", ["nf4", "fp4"])
    @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"])
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
    @pytest.mark.parametrize(
        "quant_storage",
        [torch.uint8, torch.float16, torch.bfloat16, torch.float32],
        ids=describe_dtype,
    )
    @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
    def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
        if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
            pytest.skip("This configuration is not supported on HPU.")

        errs1 = []
        errs2 = []
        errs3 = []
        relerrs1 = []
        relerrs2 = []
        relerrs3 = []
        max_errs1 = []
        max_errs2 = []
        max_errs3 = []

        # Large number of iterations is excessive and slow on CPU.
        # Keep for CUDA/XPU for now.
        iters = 10 if device == "cpu" else 100

        for i in range(iters):
            if kind == "fc1":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim * 4, dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "fc2":
                A = torch.randn(1, 4 * dim, dtype=dtype, device=device)
                B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "attn":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim)
            elif kind == "attn_packed":
                A = torch.randn(1, dim, dtype=dtype, device=device)
                B = torch.randn(dim * 3, dim, dtype=dtype, device=device) / math.sqrt(dim)

            qB, state = F.quantize_4bit(
                B,
                quant_type=storage_type,
                compress_statistics=double_quant,
                quant_storage=quant_storage,
            )
            C3 = torch.matmul(A, B.t())

            # CPU requires convert weight packed for gemv
            if device == "cpu" and F.has_avx512bf16():
                qB, state = F._convert_weight_packed_for_cpu(qB, state)
                qB = qB.t()

            C2 = F.gemv_4bit(A, qB.t(), state=state)
            A.requires_grad = True
            C1 = bnb.matmul_4bit(A, qB.t(), state)

            err1 = (C1 - C2).abs().float()
            err2 = (C3 - C2).abs().float()
            err3 = (C3 - C1).abs().float()

            mag1 = torch.abs(C1).float() + 1e-5
            mag2 = torch.abs(C3).float() + 1e-5
            mag3 = torch.abs(C3).float() + 1e-5

            relerr1 = err1 / mag1
            relerr2 = err2 / mag2
            relerr3 = err3 / mag3

            max_err1 = err1.max()
            max_err2 = err2.max()
            max_err3 = err3.max()

            errs1.append(err1.mean().item())
            errs2.append(err2.mean().item())
            errs3.append(err3.mean().item())

            relerrs1.append(relerr1.mean().item())
            relerrs2.append(relerr2.mean().item())
            relerrs3.append(relerr3.mean().item())

            max_errs1.append(max_err1.item())
            max_errs2.append(max_err2.item())
            max_errs3.append(max_err3.item())

            c = int(C1.numel() * 0.0014 * (dim / 256)) + 1
            c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=0, throw=False)

        err1 = sum(errs1) / len(errs1) / math.sqrt(dim)
        err2 = sum(errs2) / len(errs2) / math.sqrt(dim)
        err3 = sum(errs3) / len(errs3) / math.sqrt(dim)
        relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim)
        relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim)
        relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim)
        maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim)
        maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim)
        maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim)
        absratio = err2 / err3
        relratio = relerr2 / relerr3
        maxratio = relerr2 / relerr3

        # for debugging if the tests fails
        #
        # print('='*80)
        # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:')
        # print(C1.flatten()[-20:])
        # print(C2.flatten()[-20:])
        # print(f'inference vs training abs: {err1}')
        # print(f'inference vs training rel: {relerr1}')
        # print(f'inference vs training max: {maxerr1}')
        # print(f'inference vs training vs torch err ratio abs: {absratio}')
        # print(f'inference vs training vs torch err ratio rel: {relratio}')
        # print(f'inference vs training vs torch err ratio max: {maxratio}')

        if dtype == torch.float16:
            if dim <= 512:
                assert err1 < 7e-5

                # TODO(matthewdouglas): On T4, dim=128-fp16-fc2-fp4-DQ will have relerror ~ 0.00092727
                if (
                    device == "cuda"
                    and double_quant
                    and storage_type == "fp4"
                    and kind == "fc2"
                    and torch.cuda.get_device_capability() == (7, 5)
                ):
                    assert relerr1 < 0.00093
                else:
                    assert relerr1 < 0.0008
            else:
                assert err1 < 6e-5
                assert relerr1 < 2e-4
            assert absratio < 1.005 and absratio > 0.995
            assert relratio < 1.005 and relratio > 0.992
            assert maxratio < 1.005 and maxratio > 0.992
        elif dtype == torch.float32:
            if dim <= 512:
                assert err1 < 5e-8
                assert relerr1 < 1e-6
                assert maxerr1 < 1.05e-7
            else:
                assert err1 < 5e-8
                assert relerr1 < 8e-6
>               assert maxerr1 < 1e-7
E               assert 4.0937215089797974e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
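Aside: a minimal standalone repro sketch for the failing configuration above (not part of the pytest output). It assumes a CUDA build of bitsandbytes and mirrors the test's C1-vs-C2 comparison, reporting the same scaled maxerr1 metric that trips the `maxerr1 < 1e-7` assertion:

# Hedged repro sketch: dim=1024 / fp32 / nf4 / fc2 / double-quant on CUDA.
import math
import torch
import bitsandbytes as bnb
import bitsandbytes.functional as F

dim = 1024
max_errs = []
for _ in range(100):
    A = torch.randn(1, 4 * dim, dtype=torch.float32, device="cuda")
    B = torch.randn(dim, 4 * dim, dtype=torch.float32, device="cuda") / math.sqrt(dim)
    qB, state = F.quantize_4bit(B, quant_type="nf4", compress_statistics=True)
    C2 = F.gemv_4bit(A, qB.t(), state=state)   # inference kernel
    A.requires_grad = True
    C1 = bnb.matmul_4bit(A, qB.t(), state)     # autograd ("training") path
    max_errs.append((C1 - C2).abs().max().item())

# Same aggregation as the test: mean of per-iteration max errors, scaled by sqrt(dim).
maxerr1 = sum(max_errs) / len(max_errs) / math.sqrt(dim)
print(f"maxerr1 = {maxerr1:.4e} (test threshold: 1e-7)")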
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-fc2-nf4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = False, kind = 'fc2'

    [... test source identical to the first test_gemv_4bit traceback above ...]
>               assert maxerr1 < 1e-7
E               assert 4.094839096069336e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-fc2-fp4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = True, kind = 'fc2'

    [... test source identical to the first test_gemv_4bit traceback above ...]
>               assert err1 < 5e-8
E               assert 5.1653077939306514e-08 < 5e-08

tests\test_functional.py:1412: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-fc2-fp4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = False, kind = 'fc2'

    [... test source identical to the first test_gemv_4bit traceback above ...]
>               assert err1 < 5e-8
E               assert 5.175197777873564e-08 < 5e-08

tests\test_functional.py:1412: AssertionError
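Aside: an illustrative calculation (not part of the log) relating the scaled fp32 thresholds to raw error magnitudes. The test divides its accumulated errors by math.sqrt(dim), so at dim=1024 (sqrt = 32) the bounds correspond to:

# Illustrative arithmetic only, under the test's own scaling convention.
import math

dim = 1024
scale = math.sqrt(dim)        # 32.0
print(5e-8 * scale)           # ~1.6e-06 raw mean-abs-error budget (err1)
print(1e-7 * scale)           # ~3.2e-06 raw mean-of-max-error budget (maxerr1)
# The observed values in these failures (e.g. 5.17e-08, 1.03e-07) sit only a
# few percent past the thresholds, i.e. raw errors of roughly 1.65e-6 / 3.3e-6.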
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-attn-nf4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = True, kind = 'attn'

    [... test source identical to the first test_gemv_4bit traceback above ...]
>               assert maxerr1 < 1e-7
E               assert 1.0050833225250244e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-attn-nf4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = False, kind = 'attn'

    [... test source identical to the first test_gemv_4bit traceback above ...]
>               assert maxerr1 < 1e-7
E               assert 1.0270625352859497e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-attn-fp4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = True, kind = 'attn'

    [... test source identical to the first test_gemv_4bit traceback above ...]
>               assert maxerr1 < 1e-7
E               assert 1.0348856449127198e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-attn-fp4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = False, kind = 'attn'

    [... test source identical to the first test_gemv_4bit traceback above ...]
>               assert maxerr1 < 1e-7
E               assert 1.0695308446884156e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-attn_packed-nf4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = True, kind = 'attn_packed'

    [... test source identical to the first test_gemv_4bit traceback above ...]
>               assert maxerr1 < 1e-7
E               assert 1.1809170246124267e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-attn_packed-nf4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'nf4'
quant_storage = torch.float32, double_quant = False, kind = 'attn_packed'

    [... test source identical to the first test_gemv_4bit traceback above ...]
>               assert maxerr1 < 1e-7
E               assert 1.1586584150791168e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-attn_packed-fp4-DQ_True-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = True, kind = 'attn_packed'

    [... test source identical to the first test_gemv_4bit traceback above ...]
>               assert maxerr1 < 1e-7
E               assert 1.2051314115524293e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_ TestQuantize4BitFunctional.test_gemv_4bit[dim=1024-fp32-fp32-attn_packed-fp4-DQ_False-cuda] _

self =
device = 'cuda', dim = 1024, dtype = torch.float32, storage_type = 'fp4'
quant_storage = torch.float32, double_quant = False, kind = 'attn_packed'

    [... test source identical to the first test_gemv_4bit traceback above ...]
>               assert maxerr1 < 1e-7
E               assert 1.276843249797821e-07 < 1e-07

tests\test_functional.py:1414: AssertionError
_______________________ test_4bit_linear_warnings[cuda] _______________________

device = 'cuda'

    @pytest.mark.parametrize("device", get_available_devices())
    def test_4bit_linear_warnings(device):
        dim1 = 64
        with pytest.warns(UserWarning, match=r"inference or training"):
            net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, quant_type="nf4") for i in range(10)])
            net = net.to(device)
            inp = torch.rand(10, dim1, device=device, dtype=torch.float16)
            net(inp)
        with pytest.warns(UserWarning, match=r"inference."):
            net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, quant_type="nf4") for i in range(10)])
            net = net.to(device)
            inp = torch.rand(1, dim1, device=device, dtype=torch.float16)
            net(inp)

        with pytest.warns(UserWarning) as record:
            net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, quant_type="nf4") for i in range(10)])
            net = net.to(device)
            inp = torch.rand(10, dim1, device=device, dtype=torch.float16)
            net(inp)

            net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, quant_type="nf4") for i in range(10)])
            net = net.to(device)
            inp = torch.rand(1, dim1, device=device, dtype=torch.float16)
            net(inp)

>       assert len(record) == 2
E       assert 23 == 2
E        +  where 23 = len(WarningsChecker(record=True))

tests\test_modules.py:481: AssertionError
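The 23-vs-2 mismatch is consistent with how pytest.warns records warnings: every emission inside the block is appended to the record, and pytest installs an "always" filter, so a warning raised once per layer or per forward pass is not deduplicated. A minimal, self-contained illustration (not from the test suite):

    import warnings
    import pytest

    def noisy():
        warnings.warn("slow path", UserWarning)

    with pytest.warns(UserWarning) as record:
        for _ in range(5):
            noisy()   # same message, same source line, every call recorded

    # One record entry per emission, not per unique message.
    assert len(record) == 5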
============================== warnings summary ===============================
venv\Lib\site-packages\torch\jit\_script.py:365: 14 warnings
  C:\projects\bnb\venv\Lib\site-packages\torch\jit\_script.py:365: DeprecationWarning: `torch.jit.script_method` is deprecated. Please switch to `torch.compile` or `torch.export`.
    warnings.warn(

tests/test_autograd.py: 96 warnings
  C:\projects\bnb\bitsandbytes\autograd\_functions.py:140: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more information. (Triggered internally at B:\src\torch\build\aten\src\ATen/core/TensorBody.h:499.)
    has_grad = getattr(B, "grad", None) is not None

tests/test_autograd.py: 96 warnings
  C:\projects\bnb\bitsandbytes\research\autograd\_functions.py:238: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more information. (Triggered internally at B:\src\torch\build\aten\src\ATen/core/TensorBody.h:499.)
    has_grad = getattr(B, "grad", None) is not None

tests/test_autograd.py: 128 warnings
tests/test_modules.py: 1 warning
  C:\projects\bnb\bitsandbytes\autograd\_functions.py:123: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
    warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")

tests/test_autograd.py: 128 warnings
  C:\projects\bnb\bitsandbytes\research\autograd\_functions.py:213: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
    warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")

tests/test_autograd.py: 128 warnings
  C:\projects\bnb\bitsandbytes\autograd\_functions.py:123: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
    warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")

tests/test_autograd.py: 128 warnings
  C:\projects\bnb\bitsandbytes\research\autograd\_functions.py:213: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization
    warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")

tests/test_autograd.py: 256 warnings
tests/test_functional.py: 501 warnings
tests/test_linear4bit.py: 142 warnings
tests/test_modules.py: 66 warnings
tests/test_ops.py: 36 warnings
tests/test_parametrize.py: 27 warnings
  C:\projects\bnb\bitsandbytes\backends\cuda\ops.py:212: FutureWarning: _check_is_size will be removed in a future PyTorch release along with guard_size_oblivious. Use _check(i >= 0) instead.
    torch._check_is_size(blocksize)

tests/test_functional.py: 774 warnings
tests/test_modules.py: 3 warnings
tests/test_ops.py: 144 warnings
  C:\projects\bnb\bitsandbytes\backends\cuda\ops.py:464: FutureWarning: _check_is_size will be removed in a future PyTorch release along with guard_size_oblivious. Use _check(i >= 0) instead.
    torch._check_is_size(blocksize)

tests/test_modules.py::test_linear_kbit_fp32_bias[NF4-cuda]
tests/test_modules.py::test_kbit_backprop[dtype0-4bit-cuda]
tests/test_modules.py::test_kbit_backprop[dtype0-FP4-cuda]
tests/test_modules.py::test_kbit_backprop[dtype0-NF4-cuda]
tests/test_modules.py::test_kbit_backprop[dtype0-FP4+C-cuda]
tests/test_modules.py::test_kbit_backprop[dtype0-NF4+C-cuda]
  C:\projects\bnb\bitsandbytes\nn\modules.py:508: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.
    warnings.warn(

tests/test_modules.py: 24 warnings
  C:\projects\bnb\bitsandbytes\nn\modules.py:844: UserWarning: Embedding size 65 is not divisible by block size 64. This will lead to slow inference.
    warnings.warn(

tests/test_modules.py::test_4bit_embedding_weight_fsdp_fix
tests/test_modules.py::test_embedding_not_implemented_error
  C:\projects\bnb\bitsandbytes\nn\modules.py:844: UserWarning: Embedding size 32 is not divisible by block size 64. This will lead to slow inference.
    warnings.warn(

tests/test_ops.py: 48 warnings
  C:\projects\bnb\bitsandbytes\_ops.py:266: FutureWarning: _check_is_size will be removed in a future PyTorch release along with guard_size_oblivious. Use _check(i >= 0) instead.
    torch._check_is_size(blocksize)

tests/test_ops.py: 48 warnings
  C:\projects\bnb\bitsandbytes\_ops.py:239: FutureWarning: _check_is_size will be removed in a future PyTorch release along with guard_size_oblivious. Use _check(i >= 0) instead.
    torch._check_is_size(blocksize)

tests/test_ops.py: 96 warnings
  C:\projects\bnb\bitsandbytes\_ops.py:222: FutureWarning: _check_is_size will be removed in a future PyTorch release along with guard_size_oblivious. Use _check(i >= 0) instead.
    torch._check_is_size(blocksize)

tests/test_ops.py: 192 warnings
  C:\projects\bnb\bitsandbytes\_ops.py:186: FutureWarning: _check_is_size will be removed in a future PyTorch release along with guard_size_oblivious. Use _check(i >= 0) instead.
    torch._check_is_size(blocksize)

tests/test_ops.py: 192 warnings
  C:\projects\bnb\bitsandbytes\_ops.py:284: FutureWarning: _check_is_size will be removed in a future PyTorch release along with guard_size_oblivious. Use _check(i >= 0) instead.
    torch._check_is_size(blocksize)

tests/test_parametrize.py::test_parametrization_forward_method
  C:\projects\bnb\bitsandbytes\backends\default\ops.py:223: FutureWarning: _check_is_size will be removed in a future PyTorch release along with guard_size_oblivious. Use _check(i >= 0) instead.
    torch._check_is_size(blocksize)

tests/test_parametrize.py::test_parametrization_forward_method
  C:\projects\bnb\bitsandbytes\backends\cpu\ops.py:132: FutureWarning: _check_is_size will be removed in a future PyTorch release along with guard_size_oblivious. Use _check(i >= 0) instead.
    torch._check_is_size(blocksize)

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
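The repeated _check_is_size FutureWarnings above all point at the same one-line migration. A sketch, assuming only the non-negativity check from the warning text is needed (torch._check is a private PyTorch API, so its behavior may shift between releases):

    import torch

    blocksize = 64

    # Deprecated form flagged throughout the warnings summary:
    # torch._check_is_size(blocksize)

    # Replacement suggested by the warning text; raises at runtime if the
    # condition is False.
    torch._check(blocksize >= 0)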
=================================== PASSES ====================================
===== 157 failed, 3680 passed, 163 skipped, 182 deselected, 24 xfailed, 3278 warnings in 4296.71s (1:11:36) =====
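On the Linear4bit compute-dtype UserWarning in the summary above: it can typically be silenced by passing compute_dtype explicitly at construction. A hypothetical sketch (sizes are arbitrary, CUDA is assumed, and matching the fp16 input dtype is an inference from the warning text, not a recommendation made by this log):

    import torch
    import bitsandbytes as bnb

    # compute_dtype matches the fp16 activations, avoiding the fp32 default.
    layer = bnb.nn.Linear4bit(64, 64, compute_dtype=torch.float16, quant_type="nf4")
    layer = layer.to("cuda")   # quantizes the weight on transfer to the device
    out = layer(torch.rand(10, 64, device="cuda", dtype=torch.float16))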