Skip to content

Commit 8dd4622

Browse files
committed
Minor updates to DMS and KVzap
Signed-off-by: SimJeg <sjegou@nvidia.com>
1 parent 8b3c2f7 commit 8dd4622

File tree

5 files changed

+14
-9
lines changed

5 files changed

+14
-9
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ Finally we provide wrapper presses that can be combined with other presses:
150150
- `BlockPress` ([source](kvpress/presses/block_press.py), [paper](https://arxiv.org/abs/2504.15364)): segments input sequence into non-overlapping blocks and compresses iteratively.
151151
- `DecodingPress` ([source](kvpress/presses/decoding_press.py)): allows for compression during decoding, see decoding section in this README.
152152
- `PrefillDecodingPress` ([source](kvpress/presses/prefill_decoding_press.py)): allows to compress both during prefilling and during decoding.
153-
- `DMSPress` ([source](kvpress/presses/dms_press.py), [paper](https://arxiv.org/abs/2506.05345)): evict keys and values with scores below a given threshold of any `ScorerPress` instead of relying on top-k scores. Support both prefilling and decoding (if decoding=True).
153+
- `DMSPress` ([source](kvpress/presses/dms_press.py), [paper](https://arxiv.org/abs/2506.05345)): evicts keys and values with scores below a given threshold of any `ScorerPress` instead of relying on top-k scores. Supports both prefilling and decoding (if decoding=True), but only supports dense-prefill and not sparse-prefill.
154154

155155
For a detailed list of existing KV cache compression methods, check [Awesome-KV-Cache-Compression](https://github.com/October2001/Awesome-KV-Cache-Compression) or [Awesome-LLM-Compression](https://github.com/HuangOwen/Awesome-LLM-Compression?tab=readme-ov-file#kv-cache-compression)
156156

kvpress/presses/dms_press.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class DMSPress(BasePress):
1717
"""
1818
Based on Dynamic Memory Sparsification (DMS, https://arxiv.org/abs/2506.05345) inference.
1919
Wraps a ScorerPress and evicts keys/values with scores below a given threshold.
20+
This press implements a dense-prefill version of DMS, not the sparse-prefill version.
2021
2122
Unlike most presses that use a fixed compression_ratio, DMSPress uses a score threshold
2223
to determine which KV pairs to evict. This allows for adaptive compression where the actual

kvpress/presses/kvzap_press.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ class KVzapModel(PreTrainedModel):
2424

2525
def __init__(self, config):
2626
super().__init__(config)
27+
self.all_tied_weights_keys = {}
2728
if config.hidden_dim is None:
2829
# Linear model
2930
self.layers = nn.ModuleList(
@@ -72,8 +73,7 @@ def score(
7273
attentions: torch.Tensor,
7374
kwargs: dict,
7475
) -> torch.Tensor:
75-
module = self.kvzap_model.layers[module.layer_idx]
76-
module = module.to(hidden_states.device, dtype=hidden_states.dtype).eval()
77-
with torch.no_grad():
78-
scores = module(hidden_states).transpose(1, 2)
76+
kvzap_module = self.kvzap_model.layers[module.layer_idx]
77+
kvzap_module = kvzap_module.to(hidden_states.device, dtype=hidden_states.dtype).eval()
78+
scores = kvzap_module(hidden_states).transpose(1, 2)
7979
return scores

kvzap/data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ def _forward_hook(self, module, input, kwargs, output):
201201
scale = scale.repeat_interleave(module.o_proj.block_size[0], dim=0)
202202
scale = scale.repeat_interleave(module.o_proj.block_size[1], dim=1)
203203
Wo = Wo.to(V.dtype) * scale
204-
Wo = Wo.view(module.config.num_attention_heads, module.head_dim, module.config.hidden_size)
204+
Wo = Wo.view(module.config.num_attention_heads, V.shape[-1], module.config.hidden_size)
205205
WoV_norm = torch.einsum("h i j, b h t i -> b h t j", Wo.to(dtype=V.dtype), V).norm(dim=-1)
206206
scores = torch.einsum("b h t i, b h i -> b h t i", scores, WoV_norm)
207207

kvzap/train.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,17 +106,21 @@ def train_linear(X: torch.Tensor, y: torch.Tensor) -> KVzapModel:
106106
# Train a linear model for each layer
107107
params = []
108108
for layer_idx in tqdm(range(X.shape[1]), desc="Training linear models"):
109+
X_train = X[:, layer_idx].clone().to(torch.float32).numpy()
110+
y_train = y[:, layer_idx].clone().to(torch.float32).numpy()
109111
linear = Ridge()
110-
linear.fit(X[:, layer_idx].float(), y[:, layer_idx].float())
112+
linear.fit(X_train, y_train)
111113
params.append((linear.coef_, linear.intercept_))
112114

113115
# Load the parameters into a KVzapModel
114116
linear_model = KVzapModel(
115117
KVzapConfig(input_dim=X.shape[2], hidden_dim=None, output_dim=y.shape[2], n_modules=X.shape[1])
116118
)
117119
for layer_idx, (W, b) in enumerate(params):
118-
linear_model.layers[layer_idx].weight.data = torch.tensor(W, dtype=X.dtype) # type: ignore[index]
119-
linear_model.layers[layer_idx].bias.data = torch.tensor(b, dtype=X.dtype) # type: ignore[index]
120+
W = torch.tensor(np.atleast_2d(W), dtype=X.dtype)
121+
b = torch.tensor(np.atleast_1d(b), dtype=X.dtype)
122+
linear_model.layers[layer_idx].weight.data = W # type: ignore[index]
123+
linear_model.layers[layer_idx].bias.data = b # type: ignore[index]
120124
return linear_model
121125

122126

0 commit comments

Comments
 (0)