# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests whether GPTQ models with a dynamic quantization config can be loaded.

Run `pytest tests/quantization/test_gptq_dynamic.py --forked`.
"""

import pytest
import torch

from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
from vllm.model_executor.layers.quantization.gptq_marlin import (
    GPTQMarlinLinearMethod)
from vllm.model_executor.layers.quantization.utils.gptq_utils import (
    get_dynamic_override)

PROMPT = "On the surface of Mars, we found"

# Each entry is a (model_id, use_marlin_kernel) pair consumed by the
# parametrized test below.
#
# Layer layout of the checkpoints:
#   * layer 0: quantized with bits=4, group_size=128
#   * layer 1: quantized with bits=8, group_size=32
#   * layers with index >= 2: not quantized
MODEL_QUANT = [
    (
        "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
        True,
    ),
    (
        "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
        False,
    ),
]


@pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT)
def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
                           monkeypatch):
    """Load a GPTQ checkpoint with a dynamic config and check selected layers.

    Args:
        vllm_runner: Fixture used as a context manager to load ``model_id``.
        model_id: Checkpoint identifier taken from ``MODEL_QUANT``.
        use_marlin_kernel: When true, quantized layers are expected to use
            ``GPTQMarlinLinearMethod``; otherwise ``GPTQLinearMethod``.
        monkeypatch: pytest fixture used to set an environment variable for
            the duration of the test.
    """
    # `LLM.apply_model` requires pickling a function.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

    # Quantized layers must use the linear method matching the expected kernel.
    linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else (
        GPTQLinearMethod)

    with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm:

        def check_model(model):
            """Assert the quant method (and settings) of selected modules."""
            # NOTE(review): each branch only runs when a module with that
            # exact name exists, so a renamed module is skipped silently
            # rather than failing the test.
            for name, submodule in model.named_modules():
                if name == "lm_head":
                    assert isinstance(submodule.quant_method,
                                      linear_method_cls)
                elif name == 'model.layers.0.self_attn.qkv_proj':
                    # The first layer is quantized using bits=4, group_size=128
                    # desc_act=True
                    assert isinstance(submodule.quant_method,
                                      linear_method_cls)
                    config = submodule.quant_method.quant_config
                    assert config.weight_bits == 4
                    assert config.group_size == 128
                    assert config.desc_act
                elif name == 'model.layers.1.self_attn.qkv_proj':
                    # The second layer is quantized using bits=8, group_size=32
                    # desc_act=False
                    assert isinstance(submodule.quant_method,
                                      linear_method_cls)
                    config = submodule.quant_method.quant_config
                    # These values are read through the per-layer override
                    # lookup rather than from the config attributes directly.
                    assert get_dynamic_override(config,
                                                layer_name=name,
                                                key="bits") == 8
                    assert get_dynamic_override(config,
                                                layer_name=name,
                                                key="group_size") == 32
                    assert not get_dynamic_override(
                        config, layer_name=name, key="desc_act")
                elif name in ('model.layers.2.self_attn.qkv_proj',
                              'model.layers.2.mlp.gate_up_proj'):
                    # All other layers (layer index >= 2) are not quantized
                    assert isinstance(submodule.quant_method,
                                      UnquantizedLinearMethod)

        llm.apply_model(check_model)