 from ..._ops import register_kernel
 from ...cextension import lib
 
+# torch._int_mm for s8@s8->s32 is supported on CPU from torch 2.4+.
+# However, we can overflow if we use this without AVX512_VNNI support.
+# This is fixed in torch 2.6+, so we set this as the minimum to be safe.
+# For more information: https://github.com/pytorch/pytorch/pull/136942
+# TODO(matthewdouglas): aarch64?
+if torch.__version__ >= (2, 6):
 
-@register_kernel("bitsandbytes::int8_linear_matmul", "cpu")
-def _(A: torch.Tensor, B: torch.Tensor):
-    return _int8_linear_matmul_impl(A, B)
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul.out", "cpu")
-def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
-    torch._check(out.dtype == torch.int32)
-    _int8_linear_matmul_impl(A, B, out)
-
-
-def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Tensor] = None):
-    # Naive implementation: perform matmul in fp32
-    result = torch.matmul(A.float(), B.float().t()).to(torch.int32)
-    if out is not None:
-        result = out.copy_(result)
-    return result
+    @register_kernel("bitsandbytes::int8_linear_matmul", "cpu")
+    def _(A: torch.Tensor, B: torch.Tensor):
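+        # torch._int_mm operates on 2D int8 inputs, so flatten any leading
+        # batch dims of A and restore them on the int32 result.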
+        return torch._int_mm(
+            A.reshape(-1, A.shape[-1]),
+            B.t(),
+        ).reshape(*A.shape[:-1], B.shape[0])
 
 
 @register_kernel("bitsandbytes::int8_mm_dequant", "cpu")
@@ -92,3 +87,56 @@ def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, |
     )
 
     return out
+
+
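+# The 16 NF4 code values: the normalized quantiles of a standard normal
+# distribution, as introduced in the QLoRA paper.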
+_NF4_QUANT_TABLE = torch.tensor(
+    [
+        -1.0,
+        -0.6961928009986877,
+        -0.5250730514526367,
+        -0.39491748809814453,
+        -0.28444138169288635,
+        -0.18477343022823334,
+        -0.09105003625154495,
+        0.0,
+        0.07958029955625534,
+        0.16093020141124725,
+        0.24611230194568634,
+        0.33791524171829224,
+        0.44070982933044434,
+        0.5626170039176941,
+        0.7229568362236023,
+        1.0,
+    ],
+    dtype=torch.float32,
+    device="cpu",
+)
+
+
+@register_kernel("bitsandbytes::quantize_4bit", "cpu")
+def _(
+    A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    torch._check_is_size(blocksize)
+    torch._check(quant_type == "nf4", lambda: f"quant_type must be nf4 on CPU, got {quant_type}")
+
+    n = A.numel()
+
+    # TODO: Support when weight matrix is not divisible by blocksize
+    torch._check(n % blocksize == 0, lambda: f"n must be divisible by blocksize, got {n} and {blocksize}")
+
+    # Divide into blocks and normalize
+    blocks = A.reshape(-1, blocksize)
+    absmax = blocks.abs().max(dim=1).values.float()
+    scaled = blocks / absmax.unsqueeze(-1)
+
+    # Quantize with the lookup table
+    quantized = torch.argmin(torch.abs(scaled.view(-1, 1) - _NF4_QUANT_TABLE), dim=-1, keepdim=True).to(torch.uint8)
+
+    # Pack two quantized values per byte
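+    # (even-indexed codes fill the high nibble, odd-indexed codes the low nibble)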
+    packed = quantized[::2] << 4 | quantized[1::2]
+
+    if quant_storage != torch.uint8:
+        packed = packed.squeeze().view(quant_storage).unsqueeze(1)
+
+    return packed, absmax.float()
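
For reference, a quick smoke test of the new int8 kernel. This is a sketch, not part of the PR: the shapes are arbitrary, and it assumes `register_kernel` exposes the op under `torch.ops.bitsandbytes` with `bitsandbytes` imported so the CPU kernel is registered:

```python
import torch
import bitsandbytes  # registers the CPU kernels

# Batched int8 activations and int8 weight rows (shapes are arbitrary).
A = torch.randint(-128, 128, (2, 3, 64), dtype=torch.int8)
B = torch.randint(-128, 128, (16, 64), dtype=torch.int8)

# The kernel flattens A's batch dims, runs torch._int_mm, and restores them.
out = torch.ops.bitsandbytes.int8_linear_matmul(A, B)
assert out.shape == (2, 3, 16) and out.dtype == torch.int32
```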
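And a minimal sketch of the inverse of the NF4 path above, useful for checking the packing layout. `dequantize_nf4_sketch` is a hypothetical helper, not part of this PR; it assumes `quant_storage=torch.uint8` and the divisibility constraint enforced above:

```python
import torch

def dequantize_nf4_sketch(packed: torch.Tensor, absmax: torch.Tensor, blocksize: int) -> torch.Tensor:
    # Hypothetical helper (not in this PR). Undo the nibble packing:
    # even-indexed codes were stored in the high nibble, odd-indexed in the low.
    high = (packed >> 4) & 0x0F
    low = packed & 0x0F
    codes = torch.stack([high, low], dim=1).reshape(-1)
    # Map codes back to NF4 values, then rescale each block by its absmax.
    values = _NF4_QUANT_TABLE[codes.long()].reshape(-1, blocksize)
    return values * absmax.unsqueeze(-1)  # caller reshapes to A's original shape
```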