test_eagle_quantization.py 5.28 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from unittest.mock import Mock, patch

import pytest
import torch

from vllm.config import LoadConfig, ModelConfig, SpeculativeConfig, VllmConfig
from vllm.model_executor.models.utils import get_draft_quant_config
from vllm.platforms import current_platform

# Devices used to parametrize device-dependent tests. On CUDA-like platforms
# this is "cuda:0" alone when exactly one device is reported, otherwise
# "cuda:0" and "cuda:1"; on every other platform it is just "cpu".
if current_platform.is_cuda_alike():
    DEVICES = [
        f"cuda:{index}"
        for index in range(1 if torch.cuda.device_count() == 1 else 2)
    ]
else:
    DEVICES = ["cpu"]


def test_get_draft_quant_config_with_draft_model():
    """Check that a configured draft model drives the quantization lookup.

    ``VllmConfig.get_quantization_config`` is patched, so this only verifies
    that ``get_draft_quant_config`` forwards the draft model config and the
    load config to it and hands back whatever it returns.
    """
    draft_model_cfg = Mock(spec=ModelConfig)
    load_cfg = Mock(spec=LoadConfig)
    speculative_cfg = Mock(
        spec=SpeculativeConfig,
        draft_model_config=draft_model_cfg,
    )
    vllm_cfg = Mock(
        spec=VllmConfig,
        speculative_config=speculative_cfg,
        load_config=load_cfg,
    )
    expected_quant_cfg = Mock()

    quant_lookup_patch = patch.object(
        VllmConfig, "get_quantization_config", return_value=expected_quant_cfg
    )
    with quant_lookup_patch as patched_lookup:
        result = get_draft_quant_config(vllm_cfg)

        # The lookup must be made exactly once, for the draft model.
        patched_lookup.assert_called_once_with(draft_model_cfg, load_cfg)
        assert result == expected_quant_cfg


def test_get_draft_quant_config_without_draft_model():
    """Check that a missing draft model config yields no quantization config."""
    speculative_cfg = Mock(spec=SpeculativeConfig, draft_model_config=None)
    vllm_cfg = Mock(
        spec=VllmConfig,
        speculative_config=speculative_cfg,
        load_config=Mock(spec=LoadConfig),
    )

    assert get_draft_quant_config(vllm_cfg) is None


@torch.inference_mode()
@pytest.mark.parametrize("device", DEVICES)
def test_fc_layer_quant_config_usage(dist_init, device) -> None:
    """Check that ``ReplicatedLinear`` keeps the quant config it is given.

    Builds one layer with ``quant_config=None`` and one with a mocked quant
    config, checks the stored attributes, and runs a forward pass through the
    unquantized layer to confirm the output shape.

    Args:
        dist_init: pytest fixture defined outside this file; presumably sets
            up the distributed state the layer needs (confirm in conftest).
        device: Device string taken from ``DEVICES``.
    """
    from vllm.model_executor.layers.linear import ReplicatedLinear

    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)

    # The default device is process-wide state; remember it so that this test
    # does not change where later tests allocate their tensors.
    previous_default_device = torch.get_default_device()
    torch.set_default_device(device)
    try:
        input_size = 256
        output_size = 128

        fc_no_quant = ReplicatedLinear(
            input_size=input_size,
            output_size=output_size,
            bias=False,
            params_dtype=torch.float16,
            quant_config=None,
            prefix="fc",
        )

        assert fc_no_quant.quant_config is None
        assert fc_no_quant.input_size == input_size
        assert fc_no_quant.output_size == output_size

        mock_quant_config = Mock()
        fc_with_quant = ReplicatedLinear(
            input_size=input_size,
            output_size=output_size,
            bias=False,
            params_dtype=torch.float16,
            quant_config=mock_quant_config,
            prefix="fc",
        )

        assert fc_with_quant.quant_config == mock_quant_config

        # Check forward pass (only the unquantized layer has a real
        # implementation behind it; the other one is backed by a Mock).
        x = torch.randn(2, input_size, dtype=torch.float16)
        output, _ = fc_no_quant(x)
        assert output.shape == (2, output_size)
    finally:
        torch.set_default_device(previous_default_device)


def test_kv_cache_scale_name_handling():
    """Check the KV-cache scale lookup on a mocked quant config.

    Only the mock is exercised: the test confirms that ``get_cache_scale`` is
    called once with the weight name and that its configured return value
    comes back unchanged.
    """
    expected_scale_name = "layers.0.self_attn.kv_scale"
    weight_name = "layers.0.self_attn.k_proj.weight"

    # Stand-in for a quant config that reports a cache scale for the weight.
    quant_cfg = Mock()
    quant_cfg.get_cache_scale.return_value = expected_scale_name

    resolved_scale_name = quant_cfg.get_cache_scale(weight_name)

    quant_cfg.get_cache_scale.assert_called_once_with(weight_name)
    assert resolved_scale_name == expected_scale_name


def test_kv_cache_scale_name_no_scale():
    """Check the lookup result when the mocked quant config has no scale.

    The mock's ``get_cache_scale`` is configured to return ``None``, which is
    what the test expects back for a weight without a cache scale.
    """
    weight_name = "layers.0.mlp.gate_proj.weight"

    quant_cfg = Mock()
    quant_cfg.get_cache_scale.return_value = None

    assert quant_cfg.get_cache_scale(weight_name) is None


def test_maybe_remap_kv_scale_name():
    """Smoke-test ``maybe_remap_kv_scale_name`` on a non-KV scale name.

    The check is deliberately permissive: the helper may return ``None``, the
    name unchanged, or a name that is a key of ``params_dict``.
    """
    from vllm.model_executor.model_loader.weight_utils import maybe_remap_kv_scale_name

    params_dict = {
        "layers.0.self_attn.kv_scale": Mock(),
        "layers.1.self_attn.kv_scale": Mock(),
    }
    name = "layers.0.self_attn.some_scale"

    remapped = maybe_remap_kv_scale_name(name, params_dict)

    dropped_or_unchanged = remapped is None or remapped == name
    assert dropped_or_unchanged or remapped in params_dict


def test_load_weights_kv_scale_handling():
    """Re-enact the KV-cache-scale steps of weight loading on mocks.

    No vLLM code is called. The test walks through the steps inline: ask the
    quant config for the cache scale name of a weight, look up the matching
    parameter, and reduce the loaded tensor to a scalar (a tensor that is not
    already 0-dim contributes its first element).

    Every assertion runs unconditionally, so the test cannot pass vacuously
    if the scale lookup ever stops returning a name.
    """
    kv_scale_param = Mock()
    kv_scale_param.weight_loader = Mock()
    params_dict = {
        "layers.0.self_attn.kv_scale": kv_scale_param,
    }

    mock_quant_config = Mock()
    mock_quant_config.get_cache_scale = Mock(return_value="layers.0.self_attn.kv_scale")

    # Load_weights logic for KV cache scales
    name = "layers.0.self_attn.k_proj.weight"
    loaded_weight_tensor = torch.tensor([1.0, 2.0])

    scale_name = mock_quant_config.get_cache_scale(name)
    mock_quant_config.get_cache_scale.assert_called_once_with(name)
    assert scale_name == "layers.0.self_attn.kv_scale"

    param = params_dict[scale_name]
    assert param is kv_scale_param

    weight_to_load = (
        loaded_weight_tensor
        if loaded_weight_tensor.dim() == 0
        else loaded_weight_tensor[0]
    )

    # Compare against concrete expectations rather than against the same
    # expression that produced the value.
    assert weight_to_load.dim() == 0
    assert weight_to_load.item() == 1.0