# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from contextlib import contextmanager
from dataclasses import dataclass, field
from io import StringIO
from typing import Optional

import numpy as np
import requests  # type: ignore[import-untyped]
import torch

from kvpress.presses.base_press import BasePress


PATTERNS_DICT = {
    "togethercomputer/Llama-2-7B-32K-Instruct": "Llama-2-7B-32K-Instruct/lr%3D0.02-reg%3D0.05-ctx%3D1000_32000-multi_passkey10",  # noqa: E501
    "gradientai/Llama-3-8B-Instruct-Gradient-1048k": "Llama-3-8B-Instruct-Gradient-1048k/lr%3D0.02-reg%3D0.05-ctx%3D1000_32000-multi_passkey10",  # noqa: E501
    "gradientai/Llama-3-8B-Instruct-Gradient-4194k": "Llama-3-8B-Instruct-Gradient-4194k/lr%3D0.02-reg%3D0.05-ctx%3D1000_32000-multi_passkey10",  # noqa: E501
    "meta-llama/Meta-Llama-3.1-8B-Instruct": "Meta-Llama-3.1-8B-Instruct/lr=0.02-reg=0.05-ctx=1000_128000-multi_passkey10",  # noqa: E501
    "mistralai/Mistral-7B-Instruct-v0.2": "Mistral-7B-Instruct-v0.2/lr%3D0.02-reg%3D0.05-ctx%3D1000_32000-multi_passkey10",  # noqa: E501
    "mistralai/Mistral-7B-Instruct-v0.3": "Mistral-7B-Instruct-v0.3/lr%3D0.02-reg%3D0.05-ctx%3D1000_32000-multi_passkey10",  # noqa: E501
}


@dataclass
class DuoAttentionPress(BasePress):
    """
    Implements DuoAttention (https://arxiv.org/abs/2410.10819).

    Splits attention heads into two types:
    - Retrieval heads: use the full KV cache
    - Streaming heads: use only sink and recent tokens

    Head classification is based on scores loaded from https://github.com/mit-han-lab/duo-attention.
    The higher the head_compression_ratio, the more streaming heads are used.
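
    Usage sketch (illustrative; assumes the checkpoint is listed in PATTERNS_DICT
    and that input_ids holds a pre-tokenized prompt):

        from transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
        press = DuoAttentionPress(head_compression_ratio=0.5)
        with press(model):
            outputs = model(input_ids)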
    """

    head_compression_ratio: float = 0.0
    compression_ratio_: Optional[float] = field(init=False, default=None)
    recent_size: Optional[int] = field(init=False, default=None)
    sink_size: Optional[int] = field(init=False, default=None)
    streaming_mask: Optional[torch.Tensor] = field(init=False, default=None)

    def __post_init_from_model__(self, model):
        """
        Initialize sink_size, recent_size, and streaming_mask from a model.
        """
        # Load attention pattern from the DuoAttention repo
        self.sink_size, self.recent_size, head_scores = self.load_attention_pattern(model)

        # Define retrieval and streaming heads through a binary mask
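        # (Illustrative, with hypothetical numbers: for head_scores of shape
        # (2 layers, 2 KV heads) = [[0.9, 0.1], [0.3, 0.8]] and
        # head_compression_ratio=0.5, n_pruned = round(4 * 0.5) = 2, so the two
        # lowest-scoring heads, 0.1 and 0.3, are marked as streaming heads.)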
        n_pruned = round(head_scores.size * self.head_compression_ratio)
        self.streaming_mask = torch.zeros(head_scores.shape, dtype=torch.bool, device=model.device)
        if n_pruned > 0:
            indices = np.argsort(head_scores, axis=None)[:n_pruned]
            self.streaming_mask[np.unravel_index(indices, head_scores.shape)] = True

    @property
    def compression_ratio(self) -> float:
        assert self.compression_ratio_ is not None, "Forward pass must be run to compute the compression ratio"
        return self.compression_ratio_

    @compression_ratio.setter
    def compression_ratio(self, value):
        raise AttributeError(f"compression ratio cannot be set for {type(self).__name__}")

    def compress(self, module, hidden_states, keys, values, attentions, kwargs):

        assert module.config._attn_implementation != "eager", "eager mode not supported"
        q_len = hidden_states.shape[1]

        if (self.head_compression_ratio > 0) and (q_len > (self.sink_size + self.recent_size)):

            # Save indices to mask during the attention mechanism. Please refer to attention_patch.py for more details
            masked_keys = torch.zeros_like(keys[..., 0], dtype=torch.bool)
            masked_keys[:, self.streaming_mask[module.layer_idx], self.sink_size : -self.recent_size] = True
            module.masked_key_indices = torch.nonzero(masked_keys, as_tuple=True)

        # Compute the compression ratio
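        # (Illustrative, with hypothetical numbers: with 25% streaming heads,
        # sink_size=128, recent_size=256 and q_len=4096, the ratio is
        # 0.25 * (1 - 384 / 4096) ≈ 0.227.)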
        self.compression_ratio_ = self.streaming_mask.float().mean().item()
        self.compression_ratio_ *= 1 - (self.sink_size + self.recent_size) / q_len

        return keys, values

    @staticmethod
    def load_attention_pattern(model):
        """
        Load the attention pattern from the DuoAttention repo.
        """

        assert (
            model.config.name_or_path in PATTERNS_DICT
        ), f"Checkpoint {model.config.name_or_path} not in {list(PATTERNS_DICT.keys())}"
        base_url = "https://raw.githubusercontent.com/mit-han-lab/duo-attention/refs/heads/main/attn_patterns"
        url = f"{base_url}/{PATTERNS_DICT[model.config.name_or_path]}/"

        # Load config
        config = requests.get(url + "config.json").json()

        # Load head scores and clip as in duo_attn.utils.load_attn_pattern
        text = requests.get(url + "full_attention_heads.tsv").text
        head_scores = np.loadtxt(StringIO(text), dtype=float, delimiter="\t")
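        # head_scores is expected to be a (n_layers, n_kv_heads) array with one
        # score per KV head; lower scores indicate more streaming-like heads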
        head_scores = np.clip(head_scores, 0, 1)

        return config["sink_size"], config["recent_size"], head_scores

    @contextmanager
    def __call__(self, model):
        self.__post_init_from_model__(model)
        with super().__call__(model):
            yield