default_presses.py 5.13 KB
Newer Older
chenzk's avatar
v1.0  
chenzk committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import numpy as np

from kvpress import (
    CompactorPress,
    CURPress,
    DuoAttentionPress,
    ExpectedAttentionPress,
    ExpectedAttentionStatsPress,
    FastKVzipPress,
    KeyDiffPress,
    KnormPress,
    KVzapPress,
    KVzipPress,
    LagKVPress,
    LeverageScorePress,
    NonCausalAttnPress,
    PyramidKVPress,
    QFilterPress,
    RandomPress,
    SimLayerKVPress,
    SnapKVPress,
    StreamingLLMPress,
    ThinKPress,
    TOVAPress,
)
from kvpress.presses.fastkvzip_press import FastKVzipGate
from kvpress.presses.kvzap_press import KVzapConfig, KVzapModel


class TestDuoAttentionPress(DuoAttentionPress):
    """Test version of DuoAttentionPress that fabricates a random attention pattern."""

    @staticmethod
    def load_attention_pattern(model):
        """Return ``(2, 2, pattern)`` where ``pattern`` is a random array.

        ``pattern`` has shape ``(num_hidden_layers, num_key_value_heads)``, both
        read from ``model.config``, with values drawn by ``np.random.rand``.
        NOTE(review): the two leading integers are presumably the size values
        the parent press expects alongside the pattern -- confirm against
        DuoAttentionPress.
        """
        config = model.config
        pattern_shape = (config.num_hidden_layers, config.num_key_value_heads)
        return 2, 2, np.random.rand(*pattern_shape)


class TestKVzapPress(KVzapPress):
    """Test version of KVzapPress that creates a mock model instead of loading from HuggingFace."""

    def post_init_from_model(self, model):
        """Build ``self.kvzap_model`` from a KVzapConfig sized after ``model.config``."""
        model_config = model.config
        config_kwargs = {
            "input_dim": model_config.hidden_size,
            "output_dim": model_config.num_key_value_heads,
            "hidden_dim": None,  # Use linear model for testing
            "n_modules": model_config.num_hidden_layers,
        }
        self.kvzap_model = KVzapModel(KVzapConfig(**config_kwargs))


class TestFastKVzipPress(FastKVzipPress):
    """Test version of FastKVzipPress that creates a mock model instead of loading from HuggingFace."""

    def post_init_from_model(self, model):
        """Populate ``self.gates`` with one FastKVzipGate per hidden layer.

        Does nothing when ``self.gates`` is already set. Otherwise the gate
        dimensions are read from ``model.config`` and every gate is moved to
        ``model.device``.
        """
        if self.gates is not None:
            return

        gate_dtype = model.config.dtype
        hidden_size = model.config.hidden_size
        # Number of attention heads per key/value head.
        heads_per_kv = model.config.num_attention_heads // model.config.num_key_value_heads
        kv_heads = model.config.num_key_value_heads

        self.gates = [
            FastKVzipGate(layer_idx, hidden_size, kv_heads, heads_per_kv, gate_dtype).to(model.device)
            for layer_idx in range(model.config.num_hidden_layers)
        ]


# Registry of every press to be tested.
# Each item is {"cls": <press class>, "kwargs": [<constructor kwargs>, ...]};
# within an item, the kwargs dicts should be ordered from easy to hard compression.
default_presses = [
    {"cls": press_cls, "kwargs": kwargs_list}
    for press_cls, kwargs_list in (
        (TestDuoAttentionPress, [{"head_compression_ratio": 0.2}, {"head_compression_ratio": 0.8}]),
        (KnormPress, [{"compression_ratio": 0.2}, {"compression_ratio": 0.8}]),
        (ExpectedAttentionPress, [{"compression_ratio": 0.2}, {"compression_ratio": 0.8}]),
        (ExpectedAttentionStatsPress, [{"compression_ratio": 0.2}, {"compression_ratio": 0.8}]),
        (RandomPress, [{"compression_ratio": 0.2}, {"compression_ratio": 0.8}]),
        (StreamingLLMPress, [{"compression_ratio": 0.2}, {"compression_ratio": 0.8}]),
        (QFilterPress, [{"compression_ratio": 0.2}, {"compression_ratio": 0.8}]),
        (
            SnapKVPress,
            [
                {"compression_ratio": 0.2, "window_size": 2},
                {"compression_ratio": 0.8, "window_size": 2},
            ],
        ),
        (TOVAPress, [{"compression_ratio": 0.2}, {"compression_ratio": 0.8}]),
        (
            ThinKPress,
            [
                {"key_channel_compression_ratio": 0.2, "window_size": 2},
                {"key_channel_compression_ratio": 0.8, "window_size": 2},
            ],
        ),
        (
            SimLayerKVPress,
            [
                {"lazy_threshold": 0.8, "n_initial": 1, "n_recent": 1, "n_last": 1},
                {"lazy_threshold": 0.2, "n_initial": 1, "n_recent": 1, "n_last": 1},
            ],
        ),
        (
            PyramidKVPress,
            [
                {"compression_ratio": 0.2, "window_size": 2},
                {"compression_ratio": 0.8, "window_size": 2},
            ],
        ),
        (
            LagKVPress,
            [
                {"compression_ratio": 0.5, "n_sink": 16, "lag_size": 128},
                {"compression_ratio": 0.8, "n_sink": 16, "lag_size": 128},
            ],
        ),
        (KeyDiffPress, [{"compression_ratio": 0.2}, {"compression_ratio": 0.8}]),
        (
            KVzipPress,
            [
                {"compression_ratio": 0.5, "layerwise": False},
                {"compression_ratio": 0.8, "layerwise": True},
            ],
        ),
        (TestFastKVzipPress, [{"compression_ratio": 0.2}, {"compression_ratio": 0.8}]),
        (CURPress, [{"compression_ratio": 0.2}, {"compression_ratio": 0.8}]),
        (TestKVzapPress, [{"compression_ratio": 0.2}, {"compression_ratio": 0.8}]),
        (
            CompactorPress,
            [
                {"compression_ratio": 0.5, "sink_size_start": 1, "sink_size_end": 1, "chunk_size": 256},
                {"compression_ratio": 0.8, "sink_size_start": 0, "sink_size_end": 0, "chunk_size": 256},
            ],
        ),
        (LeverageScorePress, [{"compression_ratio": 0.8, "sketch_dimension": 48}]),
        (NonCausalAttnPress, [{"compression_ratio": 0.5, "chunk_size": 256}]),
    )
]