forked from NVIDIA/kvpress
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdefault_presses.py
More file actions
63 lines (57 loc) · 2.2 KB
/
default_presses.py
File metadata and controls
63 lines (57 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import numpy as np
from kvpress import (
DuoAttentionPress,
ExpectedAttentionPress,
FinchPress,
KnormPress,
QFilterPress,
RandomPress,
SimLayerKVPress,
SnapKVPress,
StreamingLLMPress,
ThinKPress,
TOVAPress,
)
class TestDuoAttentionPress(DuoAttentionPress):
    """DuoAttentionPress variant whose attention pattern is randomly generated.

    Overrides ``load_attention_pattern`` so that the pattern is derived only
    from the model's config and NumPy's global random generator.
    """

    @staticmethod
    def load_attention_pattern(model):
        """Build a random attention pattern sized from ``model.config``.

        Args:
            model: Object exposing ``config.num_hidden_layers`` and
                ``config.num_key_value_heads``.

        Returns:
            A 3-tuple ``(2, 2, scores)`` where ``scores`` is a NumPy array of
            shape ``(num_hidden_layers, num_key_value_heads)`` filled with
            uniform samples from ``[0, 1)``.
        """
        config = model.config
        pattern_shape = (config.num_hidden_layers, config.num_key_value_heads)
        # NOTE(review): the two leading 2s are presumably the sink size and
        # recent-window size expected by DuoAttentionPress — confirm against
        # the parent implementation.
        return 2, 2, np.random.rand(*pattern_shape)
# Registry of every press that should be tested.
# Each entry pairs a press class ("cls") with a list of constructor keyword
# sets ("kwargs"). Within an entry, the kwargs are ordered from easy to hard
# compression.
default_presses = [
    {
        "cls": TestDuoAttentionPress,
        "kwargs": [{"head_compression_ratio": ratio} for ratio in (0.2, 0.8)],
    },
    # Presses configured through `compression_ratio` alone.
    *(
        {"cls": press_cls, "kwargs": [{"compression_ratio": ratio} for ratio in (0.2, 0.8)]}
        for press_cls in (
            KnormPress,
            ExpectedAttentionPress,
            RandomPress,
            StreamingLLMPress,
            QFilterPress,
        )
    ),
    {
        "cls": SnapKVPress,
        "kwargs": [{"compression_ratio": ratio, "window_size": 2} for ratio in (0.2, 0.8)],
    },
    {
        "cls": TOVAPress,
        "kwargs": [{"compression_ratio": ratio} for ratio in (0.2, 0.8)],
    },
    {
        "cls": ThinKPress,
        "kwargs": [{"key_channel_compression_ratio": ratio, "window_size": 2} for ratio in (0.2, 0.8)],
    },
    {
        "cls": SimLayerKVPress,
        # NOTE(review): thresholds run 0.8 -> 0.2; under the easy-to-hard
        # convention this implies a lower lazy_threshold compresses harder —
        # confirm against SimLayerKVPress.
        "kwargs": [
            {"lazy_threshold": threshold, "n_initial": 1, "n_recent": 1, "n_last": 1}
            for threshold in (0.8, 0.2)
        ],
    },
    {
        "cls": FinchPress,
        "kwargs": [{"compression_ratio": ratio, "condition_len": 2} for ratio in (0.2, 0.8)],
    },
]