Skip to content

Commit 3871dde

Browse files
authored
Add KeyDiffPress (#86)
1 parent 97408ee commit 3871dde

File tree

7 files changed

+188
-1
lines changed

7 files changed

+188
-1
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ Several presses inherit from `ScorerPress` ([source](kvpress/presses/scorer_pres
7777
- `QFilterPress` ([source](kvpress/presses/qfilter_press.py), [paper](https://arxiv.org/abs/2503.02812)): project the Key representations on the main SVD component of the Query vectors to approximate the attention scores.
7878
- `PyramidKVPress` ([source](kvpress/presses/pyramidkv_press.py), [paper](https://arxiv.org/abs/2406.02069)): maintain pyramid-like cache sizes, allocating more cache budget to lower layers and less to higher layers
7979
- `LagKVPress` ([source](kvpress/presses/lagkv_press.py), [paper](https://arxiv.org/abs/2504.04704)): leverages the KV lag-relative information to compress. It's query-free, attention-weight-free, and flash-attention compatible.
80+
- `KeyDiffPress` ([source](kvpress/presses/keydiff_press.py), [paper](https://arxiv.org/abs/2504.15364)): evicts tokens based solely on key similarity.
8081

8182
Some presses rely on a different logic:
8283
- `ThinKPress` ([source](kvpress/presses/think_press.py), [paper](https://arxiv.org/pdf/2407.21018)): compress the dimensions of the keys based on the channel attention score on the last queries
@@ -92,7 +93,7 @@ Finally we provide wrapper presses that can be combined with other presses:
9293
- `ChunkKVPress` ([source](kvpress/presses/chunkkv_press.py), [paper](https://arxiv.org/abs/2502.00299)): compresses by selecting important chunks, preserving semantic coherence
9394
- `ChunkPress` ([source](kvpress/presses/chunk_press.py), [paper](https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00716/125280)): compress the KV cache on each sequence chunk separately. This can yield to more uniform compression across long sequences
9495
- `CriticalKVPress` and `CriticalAdaKVPress` ([source](kvpress/presses/criticalkv_press.py), [paper](https://arxiv.org/abs/2502.03805)): refine the scores using the L1 norm of Wo @ values, coupled with a two-stage selection.
95-
96+
- `BlockPress` ([source](kvpress/presses/block_press.py), [paper](https://arxiv.org/abs/2504.15364)): segments input sequence into non-overlapping blocks and compresses iteratively.
9697

9798
For a detailed list of existing KV cache compression methods, check [Awesome-KV-Cache-Compression](https://github.com/October2001/Awesome-KV-Cache-Compression) or [Awesome-LLM-Compression](https://github.com/HuangOwen/Awesome-LLM-Compression?tab=readme-ov-file#kv-cache-compression)
9899

evaluation/evaluate.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737
StreamingLLMPress,
3838
ThinKPress,
3939
TOVAPress,
40+
BlockPress,
41+
KeyDiffPress,
4042
)
4143

4244
logger = logging.getLogger(__name__)
@@ -84,6 +86,8 @@
8486
"snap_think": ComposedPress([SnapKVPress(), ThinKPress()]),
8587
"pyramidkv": PyramidKVPress(),
8688
"finch": FinchPress(),
89+
"keydiff": KeyDiffPress(),
90+
"block_keydiff": BlockPress(press=KeyDiffPress(), block_size=128),
8791
}
8892

8993

kvpress/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
from kvpress.presses.finch_press import FinchPress
2929
from kvpress.presses.lagkv_press import LagKVPress
3030
from kvpress.presses.base_press import SUPPORTED_MODELS
31+
from kvpress.presses.block_press import BlockPress
32+
from kvpress.presses.keydiff_press import KeyDiffPress
3133

3234
# Patch the attention functions to support head-wise compression
3335
patch_attention_functions()
@@ -58,4 +60,6 @@
5860
"PyramidKVPress",
5961
"FinchPress",
6062
"LagKVPress",
63+
"BlockPress",
64+
"KeyDiffPress",
6165
]

kvpress/presses/block_press.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from dataclasses import dataclass
5+
6+
import torch
7+
from torch import nn
8+
9+
from kvpress.presses.base_press import BasePress
10+
from kvpress.presses.scorer_press import ScorerPress
11+
12+
13+
@dataclass
class BlockPress(BasePress):
    """
    Simulates the block prompt processing described in KeyDiff (https://arxiv.org/abs/2504.15364).

    Segments the input sequence into non-overlapping blocks and compresses iteratively:
    at every step only the currently kept tokens plus one new block are scored, so the
    memory overhead stays bounded for long-context inference.

    Attributes
    ----------
    press : ScorerPress
        The scoring press used to rank KV pairs at each iteration.
    block_size : int, default 128
        Number of new tokens appended to the candidate set at each iteration.
    """

    press: ScorerPress
    block_size: int = 128

    def __post_init__(self):
        assert isinstance(self.press, ScorerPress), "BlockPress requires a ScorerPress"

    @property
    def compression_ratio(self):
        # Delegate to the wrapped press so BlockPress is a drop-in replacement for it
        return self.press.compression_ratio

    @compression_ratio.setter
    def compression_ratio(self, value):
        self.press.compression_ratio = value

    def compress(
        self,
        module: nn.Module,
        hidden_states: torch.Tensor,
        keys: torch.Tensor,
        values: torch.Tensor,
        attentions: torch.Tensor,
        kwargs: dict,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Iteratively compress the KV cache block by block.

        The first ``n_kept`` positions seed the kept set; each iteration appends the
        next block of positions, scores the union with the wrapped press, and retains
        the ``n_kept`` best-scoring positions.

        Returns
        -------
        tuple[torch.Tensor, torch.Tensor]
            Compressed (keys, values), each of shape
            (batch, num_key_value_heads, n_kept, head_dim).
        """
        if self.press.compression_ratio == 0:
            return keys, values

        assert attentions is None, "BlockPress does not support attentions."

        bsz, num_key_value_heads, q_len, head_dim = keys.shape

        # min() is the idiomatic clamp (equivalent to the conditional expression)
        block_size = min(self.block_size, q_len)
        n_kept = int(q_len * (1 - self.compression_ratio))

        # Seed the kept set with the first n_kept positions
        kept_indices = torch.arange(n_kept, device=keys.device).expand(bsz, num_key_value_heads, -1)

        # Reshape hidden states to match the kept_indices
        # NOTE(review): assumes hidden dim is divisible by num_key_value_heads — confirm for GQA models
        states = hidden_states.view(bsz, q_len, num_key_value_heads, -1).transpose(1, 2)

        for i in range(n_kept, q_len, block_size):
            end = min(i + block_size, q_len)
            current_indices = torch.arange(i, end, device=keys.device).expand(bsz, num_key_value_heads, -1)
            current_indices = torch.cat([kept_indices, current_indices], dim=-1)

            # Gather hidden states for the selected indices, then restore the shape
            # Check tests/presses/test_block_press.py for correctness verification of gathered hidden states
            current_states = states.gather(2, current_indices.unsqueeze(-1).expand(-1, -1, -1, states.shape[-1]))
            current_states = current_states.transpose(1, 2).reshape(bsz, -1, hidden_states.shape[-1])

            scores = self.press.score(
                module,
                current_states,
                keys.gather(2, current_indices.unsqueeze(-1).expand(-1, -1, -1, head_dim)),
                values.gather(2, current_indices.unsqueeze(-1).expand(-1, -1, -1, head_dim)),
                attentions,
                kwargs,
            )
            # Retain the n_kept best-scoring positions among kept + current block
            topk_indices = scores.topk(n_kept, dim=-1).indices
            kept_indices = current_indices.gather(-1, topk_indices)

        kept_indices = kept_indices.unsqueeze(-1).expand(-1, -1, -1, head_dim)
        keys = keys.gather(2, kept_indices).contiguous()
        values = values.gather(2, kept_indices).contiguous()

        return keys, values

kvpress/presses/keydiff_press.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from dataclasses import dataclass
5+
6+
import torch
7+
from torch import nn
8+
from torch.nn import functional as F
9+
10+
from kvpress.presses.scorer_press import ScorerPress
11+
12+
13+
@dataclass
class KeyDiffPress(ScorerPress):
    """
    KeyDiff (https://arxiv.org/abs/2504.15364): evicts tokens based solely on key similarity.

    Each key is scored by its dissimilarity to an anchor direction — the mean of the
    L2-normalized keys along the sequence axis. Keys most similar to the anchor get the
    lowest scores and are therefore evicted first.
    """

    def score(
        self,
        module: nn.Module,
        hidden_states: torch.Tensor,
        keys: torch.Tensor,
        values: torch.Tensor,
        attentions: torch.Tensor,
        kwargs,
    ) -> torch.Tensor:
        # Anchor: average direction of the unit-norm keys over the sequence dimension
        unit_keys = F.normalize(keys, p=2, dim=-1)
        anchor = unit_keys.mean(dim=2, keepdim=True)
        # Negate so that higher score = less redundant = kept longer
        return -F.cosine_similarity(keys, anchor, dim=-1)

tests/default_presses.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
QFilterPress,
1717
PyramidKVPress,
1818
LagKVPress,
19+
KeyDiffPress,
1920
)
2021

2122

@@ -65,4 +66,5 @@ def load_attention_pattern(model):
6566
{"compression_ratio": 0.8, "n_sink": 16, "lag_size": 128}
6667
],
6768
},
69+
{"cls": KeyDiffPress, "kwargs": [{"compression_ratio": 0.2}, {"compression_ratio": 0.8}]},
6870
]

tests/presses/test_block_press.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from dataclasses import dataclass
5+
6+
import torch
7+
import torch.nn as nn
8+
from transformers import DynamicCache
9+
10+
from kvpress.presses.scorer_press import ScorerPress
11+
from kvpress.presses.block_press import BlockPress
12+
13+
from tests.fixtures import unit_test_model # noqa: F401
14+
15+
16+
@dataclass
class HiddenStatesPress(ScorerPress):
    """Dummy press scoring tokens from the hidden states, broadcast across KV heads."""

    def score(
        self,
        module: nn.Module,
        hidden_states: torch.Tensor,
        keys: torch.Tensor,
        values: torch.Tensor,
        attentions: torch.Tensor,
        kwargs,
    ) -> torch.Tensor:
        # One score per token (mean over the hidden dimension), expanded to the
        # per-head shape that keys.norm(dim=-1) has: (bsz, num_heads, q_len)
        token_scores = hidden_states.mean(-1)
        return token_scores.unsqueeze(1).expand_as(keys.norm(dim=-1))
29+
30+
31+
def test_block_press_is_streaming_top_k(unit_test_model):  # noqa: F811
    """
    Test that BlockPress correctly applies the compression ratio and keeps the output
    consistent: for a score function independent of the other tokens in a block, the
    streaming (blocked) top-k selection must match the global top-k of the bare press,
    regardless of block size.
    """
    press = HiddenStatesPress(compression_ratio=0.5)
    generator = torch.Generator().manual_seed(0)
    input_ids = torch.randint(0, 1024, (1, 256), generator=generator)

    def run_press(p):
        # Prefill under the press; return (seq_len, key checksum, value checksum).
        # (Fixes the original's discarded `.past_key_values` attribute access.)
        with p(unit_test_model):
            cache = DynamicCache()
            unit_test_model(input_ids, past_key_values=cache)
        return (
            cache.get_seq_length(),
            torch.cat(cache.key_cache).sum().item(),
            torch.cat(cache.value_cache).sum().item(),
        )

    keys_hash = []
    values_hash = []

    for block_size in [2, 4, 8, 128, 256]:
        seq_len, key_sum, value_sum = run_press(BlockPress(press=press, block_size=block_size))
        assert seq_len == 128  # 256 tokens * (1 - 0.5)
        keys_hash.append(key_sum)
        values_hash.append(value_sum)

    # Reference: the wrapped press applied globally (no blocking)
    seq_len, key_sum, value_sum = run_press(press)
    assert seq_len == 128
    keys_hash.append(key_sum)
    values_hash.append(value_sum)

    keys_tensor = torch.tensor(keys_hash)
    values_tensor = torch.tensor(values_hash)
    assert torch.allclose(keys_tensor, keys_tensor[-1])
    assert torch.allclose(values_tensor, values_tensor[-1])

0 commit comments

Comments
 (0)