Commits
45 commits
73dcb45
first commit
FaureElia Mar 12, 2025
81b53f9
created setup
FaureElia Mar 12, 2025
8f87626
first basic implementation of FinchPress
FaureElia Mar 13, 2025
1db9786
adjusted FinchPress
FaureElia Mar 14, 2025
98bc55a
introduce new parameter condition_len (still have to modify compute_f…
FaureElia Mar 15, 2025
008e7e9
Adapted compute_finch_attention to condition_len. TODO: Normalization…
miriam-16 Mar 17, 2025
5f2d089
fix question len, adjusted the question ids for generation.
giulio98 Mar 15, 2025
c512283
fix: parameter condition_len in function
miriam-16 Mar 17, 2025
87ba07e
add normalization
FaureElia Mar 17, 2025
fa0e28d
clean FinchPress
FaureElia Mar 18, 2025
c7dff0e
normalization
giulio98 Mar 18, 2025
8970c97
fix len question ids
giulio98 Mar 18, 2025
84ce456
add self
giulio98 Mar 18, 2025
f0efe61
chunked forward -- not working(gibberish output), check shapes and le…
giulio98 Mar 18, 2025
15db060
Fixed attention mask and binary mask in compute_finch_attention
FaureElia Mar 22, 2025
26c64b0
updated gitignore
FaureElia Mar 22, 2025
a6ebc11
n_kept value at last iteration (TODO: further checks)
miriam-16 Mar 22, 2025
e2f3dd6
fixed pipeline for other presses and finished adjusting number of tok…
FaureElia Mar 23, 2025
54c991b
Fix: n_kept at last iteration considers condition_len
miriam-16 Mar 23, 2025
91e9f05
adjusted n_kept, at final chunk keeps 90% of context+full question
FaureElia Mar 23, 2025
c48ea98
added docstring and general code cleaning
miriam-16 Mar 24, 2025
7ec514a
reorder indices, add normalization parameter, add sink tokens
giulio98 Mar 25, 2025
657a27d
partially fixed finch press
FaureElia Mar 26, 2025
d0bb5d0
fixed finch press, aligned to original code
FaureElia Mar 28, 2025
fdde74d
preparing for pr
giulio98 Apr 4, 2025
c1acb0d
add ChunkKV
Dominic789654 Mar 5, 2025
8533beb
Update copyright date (#60)
SimJeg Mar 13, 2025
e71f4e9
Add QFilterPress (#54)
NathanGodey Mar 17, 2025
4f917cc
Add longbench benchmark
Xnhyacinth Mar 19, 2025
8d95b14
Add DuoAttention on the fly (#63)
SimJeg Mar 19, 2025
cace3a5
Resolve stash conflict in tests/default_presses.py
giulio98 Apr 4, 2025
a833ae3
Fix: remove conflict markers after saving changes
giulio98 Apr 4, 2025
39417ad
test passes
giulio98 Apr 4, 2025
2cdd69c
add readme
giulio98 Apr 4, 2025
0e4d4c8
commit
giulio98 Apr 4, 2025
ea42501
Merge branch 'main' into features/finch_press
giulio98 Apr 4, 2025
c570844
make style
giulio98 Apr 4, 2025
9afd7f8
remove leftovers
giulio98 Apr 4, 2025
e8d84fa
make style check pass
giulio98 Apr 6, 2025
689c9c5
fix: readme
giulio98 Apr 6, 2025
f628ca4
unmodify qfilter_press.py duo_attention_press.py and longbech/calcula…
giulio98 Apr 14, 2025
c5a9ca7
typo duo attention press import
giulio98 Apr 14, 2025
f8034e5
use SnapKV window attention for finch
giulio98 Apr 14, 2025
f830665
update past_key_values using last_output from original forward
giulio98 Apr 14, 2025
3f2ed03
add import snapkv
giulio98 Apr 15, 2025
2 changes: 1 addition & 1 deletion README.md
@@ -67,6 +67,7 @@ Several presses inherit from `ScorerPress` ([source](kvpress/presses/scorer_pres
- `TOVAPress` ([source](kvpress/presses/tova_press.py), [paper](https://arxiv.org/abs/2401.06104)): attention weight of the last query averaged across heads
- `ObservedAttentionPress` ([source](kvpress/presses/observed_attention_press.py), [paper](https://arxiv.org/abs/2306.14048)): average attention weight observed during the pre-filling phase
- `QFilterPress` ([source](kvpress/presses/qfilter_press.py), [paper](https://arxiv.org/abs/2503.02812)): project the Key representations on the main SVD component of the Query vectors to approximate the attention scores.
- `FinchPress` ([source](kvpress/presses/finch_press.py), [paper](https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00716/125280)): average attention weight that context tokens receive from the question (prompt-conditioned compression).

Some presses rely on a different logic:
- `ThinKPress` ([source](kvpress/presses/think_press.py), [paper](https://arxiv.org/pdf/2407.21018)): compress the dimensions of the keys based on the channel attention score on the last queries
@@ -82,7 +83,6 @@ Finally we provide wrapper presses that can be combined with other presses:
- `ChunkPress` ([source](kvpress/presses/chunk_press.py), [paper](https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00716/125280)): compress the KV cache on each sequence chunk separately. This can yield more uniform compression across long sequences
- `CriticalKVPress` and `CriticalAdaKVPress` ([source](kvpress/presses/criticalkv_press.py), [paper](https://arxiv.org/abs/2502.03805)): refine the scores using the L1 norm of Wo @ values, coupled with a two-stage selection.


For a detailed list of existing KV cache compression methods, check [Awesome-KV-Cache-Compression](https://github.com/October2001/Awesome-KV-Cache-Compression) or [Awesome-LLM-Compression](https://github.com/HuangOwen/Awesome-LLM-Compression?tab=readme-ov-file#kv-cache-compression)

## Evaluation
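The FinchPress entry above scores context tokens by the attention they receive from the question. A toy pure-Python sketch of that scoring idea (simplified stand-in for the actual tensor implementation; shapes, names, and the top-k selection are illustrative assumptions, not the library's API):

```python
# Toy sketch of prompt-conditioned scoring as described for FinchPress:
# each context key is scored by the softmax attention weight it receives
# from the question ("condition") queries, averaged over those queries,
# and only the top-scoring keys are kept in the compressed cache.
import math


def softmax(xs):
    m = max(xs)
    es = [math.exp(x - m) for x in xs]
    s = sum(es)
    return [e / s for e in es]


def finch_scores(context_keys, question_queries):
    """Average attention weight each context key receives from the question queries."""
    scores = [0.0] * len(context_keys)
    for q in question_queries:
        logits = [sum(qi * ki for qi, ki in zip(q, k)) for k in context_keys]
        for i, w in enumerate(softmax(logits)):
            scores[i] += w
    return [s / len(question_queries) for s in scores]


keys = [[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]]      # toy context keys
queries = [[2.0, 0.0], [1.0, 1.0]]               # toy question queries
scores = finch_scores(keys, queries)
keep = sorted(range(len(keys)), key=lambda i: -scores[i])[:2]  # keep top-2 keys
```

Because each softmax row sums to 1, the averaged scores also sum to 1, so `keep` simply selects the context positions the question attends to most.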
4 changes: 3 additions & 1 deletion evaluation/evaluate.py
@@ -27,14 +27,15 @@
CriticalKVPress,
DuoAttentionPress,
ExpectedAttentionPress,
FinchPress,
KnormPress,
ObservedAttentionPress,
QFilterPress,
RandomPress,
SnapKVPress,
StreamingLLMPress,
ThinKPress,
TOVAPress,
QFilterPress,
)

logger = logging.getLogger(__name__)
@@ -76,6 +77,7 @@
"think": ThinKPress(),
"tova": TOVAPress(),
"duo_attention": DuoAttentionPress(),
"finch": FinchPress(),
"duo_attention_on_the_fly": DuoAttentionPress(on_the_fly_scoring=True),
"chunkkv": ChunkKVPress(press=SnapKVPress(), chunk_length=20),
"qfilter": QFilterPress(),
1 change: 1 addition & 0 deletions evaluation/longbench/calculate_metrics.py
@@ -4,6 +4,7 @@
import re
import string
from collections import Counter

import numpy as np
from rouge import Rouge

4 changes: 3 additions & 1 deletion kvpress/__init__.py
@@ -12,18 +12,19 @@
from kvpress.presses.criticalkv_press import CriticalAdaKVPress, CriticalKVPress
from kvpress.presses.duo_attention_press import DuoAttentionPress
from kvpress.presses.expected_attention_press import ExpectedAttentionPress
from kvpress.presses.finch_press import FinchPress
from kvpress.presses.key_rerotation_press import KeyRerotationPress
from kvpress.presses.knorm_press import KnormPress
from kvpress.presses.observed_attention_press import ObservedAttentionPress
from kvpress.presses.per_layer_compression_press import PerLayerCompressionPress
from kvpress.presses.qfilter_press import QFilterPress
from kvpress.presses.random_press import RandomPress
from kvpress.presses.scorer_press import ScorerPress
from kvpress.presses.simlayerkv_press import SimLayerKVPress
from kvpress.presses.snapkv_press import SnapKVPress
from kvpress.presses.streaming_llm_press import StreamingLLMPress
from kvpress.presses.think_press import ThinKPress
from kvpress.presses.tova_press import TOVAPress
from kvpress.presses.qfilter_press import QFilterPress

# Patch the attention functions to support head-wise compression
patch_attention_functions()
@@ -49,6 +50,7 @@
"KeyRerotationPress",
"ChunkPress",
"DuoAttentionPress",
"FinchPress",
"ChunkKVPress",
"QFilterPress",
]
13 changes: 12 additions & 1 deletion kvpress/pipeline.py
@@ -12,6 +12,7 @@
from transformers.pipelines.base import GenericTensor

from kvpress.presses.base_press import BasePress
from kvpress.presses.finch_press import FinchPress
from kvpress.presses.key_rerotation_press import KeyRerotationPress
from kvpress.presses.observed_attention_press import ObservedAttentionPress
from kvpress.presses.per_layer_compression_press import PerLayerCompressionPress
@@ -161,6 +162,14 @@ def _forward(
context_ids = input_tensors["context_ids"].to(self.model.device)
context_length = context_ids.shape[1]

if isinstance(press, FinchPress) or isinstance(getattr(press, "press", None), FinchPress):
# finch press cannot be done with multiple questions
assert len(input_tensors["questions_ids"]) == 1, "Finch press cannot be done with multiple questions"
question_ids = input_tensors["questions_ids"][0].to(self.model.device)
context_ids = torch.cat((context_ids, question_ids[:, :-1]), dim=1)
press.condition_len = len(question_ids[:, :-1][0])
input_tensors["questions_ids"][0] = question_ids[:, -1:]

# Prefilling using the press on the context
if cache is None:
cache = DynamicCache()
@@ -182,7 +191,9 @@
answer = self.generate_answer(
question_ids=question_ids.to(self.model.device),
cache=cache,
context_length=(cache.get_seq_length() if isinstance(press, KeyRerotationPress) else context_length),
context_length=(
cache.get_seq_length() if isinstance(press, (KeyRerotationPress, FinchPress)) else context_length
),
max_new_tokens=max_new_tokens,
)
answers.append(answer)
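The pipeline.py hunk above appends the question (minus its final token) to the context so FinchPress can score context tokens against it, records that length as `condition_len`, and leaves only the last question token for generation. A minimal sketch of that split using plain Python lists in place of token-id tensors (the helper name `finch_split` is illustrative, not part of the library):

```python
# Sketch of the question-conditioning split done in the pipeline for FinchPress:
# context_ids <- context + question[:-1], press.condition_len <- len(question) - 1,
# and only the final question token is fed to generation afterwards.

def finch_split(context_ids, question_ids):
    """Return (conditioned_context, condition_len, generation_ids)."""
    condition = question_ids[:-1]           # all question tokens except the last
    conditioned_context = context_ids + condition
    condition_len = len(condition)          # stored as press.condition_len
    generation_ids = question_ids[-1:]      # generation starts from the last token
    return conditioned_context, condition_len, generation_ids


ctx, cond_len, gen = finch_split([101, 102, 103, 104], [201, 202, 203])
print(ctx)       # [101, 102, 103, 104, 201, 202]
print(cond_len)  # 2
print(gen)       # [203]
```

This also explains why `context_length` is taken from `cache.get_seq_length()` for FinchPress: after compression the cache no longer matches the original context length.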
2 changes: 1 addition & 1 deletion kvpress/presses/duo_attention_press.py
@@ -1,14 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from cachetools import cached, LRUCache # type: ignore[import-untyped]
from contextlib import contextmanager
from dataclasses import dataclass, field
from io import StringIO

import numpy as np
import requests # type: ignore[import-untyped]
import torch
from cachetools import LRUCache, cached # type: ignore[import-untyped]
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb