Skip to content

Commit 97408ee

Browse files
Fix FinchPress for Qwen (#82)
1 parent f7d77d3 commit 97408ee

File tree

4 files changed

+48
-22
lines changed

4 files changed

+48
-22
lines changed

evaluation/evaluate.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,16 @@
2727
CriticalKVPress,
2828
DuoAttentionPress,
2929
ExpectedAttentionPress,
30+
FinchPress,
3031
KnormPress,
3132
ObservedAttentionPress,
33+
PyramidKVPress,
34+
QFilterPress,
3235
RandomPress,
3336
SnapKVPress,
3437
StreamingLLMPress,
3538
ThinKPress,
3639
TOVAPress,
37-
QFilterPress,
38-
PyramidKVPress,
39-
FinchPress,
4040
)
4141

4242
logger = logging.getLogger(__name__)
@@ -197,7 +197,11 @@ def evaluate(
197197
pipe = pipeline("kv-press-text-generation", model=model, device=device, model_kwargs=model_kwargs)
198198

199199
if isinstance(press, FinchPress):
200-
df["context"] = df["context"] + pipe.tokenizer.bos_token
200+
assert compress_questions is True, "FinchPress requires compress_questions to be set to True"
201+
# FinchPress uses a delimiter token to separate context and question
202+
# So we need to update the tokenizer and the model embeddings.
203+
press.update_model_and_tokenizer(pipe.model, pipe.tokenizer)
204+
df["context"] = df["context"] + press.delimiter_token
201205

202206
if compress_questions:
203207
df["context"] = df["context"] + df["question"]

kvpress/presses/finch_press.py

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,15 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44

5-
from dataclasses import dataclass, field
65
from contextlib import contextmanager
6+
from dataclasses import dataclass, field
77

88
import torch
99
from torch.nn import functional as F
10+
from transformers.models.llama.modeling_llama import rotate_half
1011

1112
from kvpress.presses.base_press import BasePress
1213
from kvpress.presses.snapkv_press import SnapKVPress
13-
from transformers.models.llama.modeling_llama import rotate_half
1414

1515

1616
@dataclass
@@ -20,7 +20,9 @@ class FinchPress(BasePress):
2020
without chunked prefilling.
2121
2222
Finch starts with SnapKV-style compression, but the window size is not fixed. Instead, the user must provide
23-
a second <bos_token> between the context and the window (input = context + tokenizer.bos_token + question)
23+
a delimiter token between the context and the window (input = context + delimiter_token + question).
24+
The delimiter token is set by the user via the update_model_and_tokenizer method.
25+
2426
2527
The options are also available
2628
- normalizing scores using the number of non-zero attention weights in the window
@@ -32,6 +34,8 @@ class FinchPress(BasePress):
3234
chunk_length: int = None
3335
normalize_scores: bool = True
3436
rerotate_keys: bool = True
37+
delimiter_token: str = field(default=None, init=False) # To be set by the update_model_and_tokenizer method
38+
delimiter_token_id: int = field(default=None, init=False) # To be set by the update_model_and_tokenizer method
3539
window_size: int = field(default=None, init=False)
3640

3741
def score(self, module, hidden_states, keys, values, attentions, kwargs):
@@ -109,23 +113,40 @@ def compress(self, module, hidden_states, keys, values, attentions, kwargs):
109113

110114
def embed_token_forward_hook(self, module, input, output):
111115
"""
112-
Forward hook to detect a second <bos_token> delimiting the context and the window
116+
Forward hook to detect a delimiter token between the context and the window
113117
"""
114-
if input[0][0, 0] == self.bos_token_id: # prefilling
118+
if input[0].shape[1] > 1 and self.delimiter_token_id in input[0][0]: # prefilling
115119
assert len(input[0]) == 1, "Only batch size 1 is supported."
116-
try:
117-
context_length = int(torch.nonzero(input[0][0] == self.bos_token_id)[1].item())
118-
self.window_size = len(input[0][0]) - 1 - context_length
119-
assert self.window_size > 0, "No window detected (window size must be > 0)."
120-
# Remove the second <bos_token> from the output
121-
output = torch.cat([output[:, :context_length], output[:, context_length + 1 :]], dim=1)
122-
except IndexError:
123-
raise IndexError("A second <bos_token> must delimit the context and the question.")
120+
# Find the delimiter token and compute the window size
121+
delim_tokens = input[0][0] == self.delimiter_token_id
122+
assert delim_tokens.sum() == 1, "Only one delimiter token should be present."
123+
context_length = int(torch.nonzero(delim_tokens)[0].item())
124+
self.window_size = len(input[0][0]) - 1 - context_length
125+
assert self.window_size > 0, "No window detected (window size must be > 0)."
126+
# Remove the delimiter token from the output
127+
output = output[:, ~delim_tokens]
124128
return output
125129

130+
def update_model_and_tokenizer(self, model, tokenizer, delimiter_token : str = "<|finch_sep|>"):
131+
"""
132+
Set the delimiter token and update the tokenizer accordingly.
133+
This method should be called before calling the press.
134+
"""
135+
self.delimiter_token = delimiter_token
136+
if delimiter_token not in tokenizer.get_vocab():
137+
tokenizer.add_special_tokens({"additional_special_tokens": [delimiter_token]})
138+
self.delimiter_token_id = tokenizer.convert_tokens_to_ids(delimiter_token) # type: ignore
139+
# update model embeddings
140+
model.resize_token_embeddings(len(tokenizer))
141+
return tokenizer
142+
126143
@contextmanager
127144
def __call__(self, model):
128-
self.bos_token_id = model.generation_config.bos_token_id
145+
# The user should set the delimiter_token_id before calling the press.
146+
if self.delimiter_token_id is None:
147+
raise ValueError("""No delimiter token ID provided.
148+
Use the update_model_and_tokenizer method before calling the press.""")
149+
129150
with super().__call__(model):
130151
try:
131152
hook = model.model.embed_tokens.register_forward_hook(self.embed_token_forward_hook)

tests/presses/test_finch_press.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
import torch
5+
56
from kvpress import FinchPress
67
from tests.fixtures import unit_test_model # noqa: F401
78

@@ -13,9 +14,8 @@ def test_finch_press(unit_test_model): # noqa: F811
1314
FinchPress(0.5, normalize_scores=False),
1415
FinchPress(0.2, chunk_length=5),
1516
]:
17+
press.delimiter_token_id = unit_test_model.config.eos_token_id
1618
with press(unit_test_model):
17-
bos = unit_test_model.generation_config.bos_token_id
1819
input_ids = torch.arange(10, 20)
19-
input_ids[0] = bos
20-
input_ids[8] = bos
20+
input_ids[8] = press.delimiter_token_id
2121
unit_test_model(input_ids.unsqueeze(0))

tests/presses/test_pyramidkv_press.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
import pytest
5-
from kvpress.presses.pyramidkv_press import PyramidKVPress
65
import torch.nn as nn
76

7+
from kvpress.presses.pyramidkv_press import PyramidKVPress
8+
89

910
class MockConfig:
1011
def __init__(self, num_hidden_layers):

0 commit comments

Comments (0)