test_compressed_tensors.py 12.9 KB
Newer Older
1
"""Test model set-up and weight loading for llmcompressor-quantized models.
2
3
4

Run `pytest tests/quantization/test_compressed_tensors.py`.
"""
5
from typing import Optional
6

7
import pytest
8
import torch
9
from compressed_tensors.quantization import QuantizationType
10

11
from tests.models.utils import check_logprobs_close
12
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
13
14
15
16
17
    CompressedTensors24, CompressedTensorsLinearMethod,
    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
    CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
    CompressedTensorsWNA16)
from vllm.platforms import current_platform
18
19


20
21
22
23
24
25
26
27
@pytest.mark.parametrize(
    "model_args",
    [("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor",
      QuantizationType.INT, 2560, True),
     ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel",
      QuantizationType.INT, 2560, True),
     ("nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", "tensor",
      QuantizationType.INT, 2560, False)])
28
def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
29
    model_path, strategy, quant_type, shape_0, is_symmetric = model_args
30
    with vllm_runner(model_path, enforce_eager=True) as llm:
31
32
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]
33

34
35
36
37
        qkv_proj = layer.self_attn.qkv_proj
        o_proj = layer.self_attn.o_proj
        gate_up_proj = layer.mlp.gate_up_proj
        down_proj = layer.mlp.down_proj
38

39
40
41
42
43
44
45
46
47
48
49
50
        # assert zp for symmetric and asymmetric cases
        def zp_valid(zp: Optional[torch.Tensor]):
            if is_symmetric:
                return zp is None

            return zp is not None and zp.dtype is torch.int32

        assert zp_valid(qkv_proj.input_zero_point)
        assert zp_valid(o_proj.input_zero_point)
        assert zp_valid(gate_up_proj.input_zero_point)
        assert zp_valid(down_proj.input_zero_point)

51
52
53
54
55
56
        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(gate_up_proj.quant_method,
                          CompressedTensorsLinearMethod)
        assert isinstance(down_proj.quant_method,
                          CompressedTensorsLinearMethod)
57
        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
58

59
        assert qkv_proj.scheme.strategy == strategy
60
        assert qkv_proj.scheme.is_static_input_scheme
61
        expected_type = torch.int8
62
63
64
65

        assert qkv_proj.weight.dtype is expected_type
        assert o_proj.weight.dtype is expected_type
        assert gate_up_proj.weight.dtype is expected_type
66

67
        if qkv_proj.scheme.strategy == "tensor":
68
69
70
71
72
73
            # Make sure it is a channelwise buffer
            # After running process_weights_after_loading
            assert len(qkv_proj.weight_scale.shape) == 2
            assert qkv_proj.weight_scale.shape[0] == shape_0
            assert qkv_proj.weight_scale.shape[1] == 1
        assert qkv_proj.weight_scale.dtype is torch.float32
74
        assert qkv_proj.input_scale.dtype is torch.float32
75

76
        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
77
78
        assert output

79

80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
@pytest.mark.parametrize(
    "model_path",
    [
        "neuralmagic/Llama-3.2-1B-quantized.w8a8"
        # TODO static & asymmetric
    ])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
                                          example_prompts, model_path,
                                          max_tokens, num_logprobs):
    dtype = "bfloat16"

    with hf_runner(model_path, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model_path, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


109
def test_compressed_tensors_no_enforce_eager(vllm_runner):
110
    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
111
    with vllm_runner(model_path) as llm:
112
        output = llm.generate_greedy("Hello my name is", max_tokens=20)
113
114
115
        assert output


116
117
@pytest.mark.parametrize("model_args", [
    ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
118
    ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
119
    ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"),
120
121
    ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
     "channel"),
122
])
123
def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
124
    model_path, strategy = model_args
125
    with vllm_runner(model_path, dtype=torch.float16) as llm:
126
127
128
129
130
131
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
132
        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
133
        assert not qkv_proj.scheme.is_static_input_scheme
134
        assert qkv_proj.scheme.strategy == strategy
135
        assert qkv_proj.weight.dtype is torch.int8
136

137
        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
138
139
        assert output

140

141
142
143
144
145
@pytest.mark.parametrize(
    "wNa16_args",
    [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8),
     ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8),
     ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)])
146
def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
147
    model, strategy, group, pack_factor = wNa16_args
148
149
150
151
152
153
    with vllm_runner(model) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj
        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
154
        assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)
155
156

        assert qkv_proj.scheme.strategy == strategy
157
        assert qkv_proj.scheme.group_size == (-1 if group is None else group)
158
159
160

        assert qkv_proj.weight_packed.dtype is torch.int32
        assert qkv_proj.weight_scale.dtype is torch.float16
161
        assert qkv_proj.scheme.pack_factor == pack_factor
162

163
164
165
        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        assert output

166
167
168
169
170
171
172
173
174
175
176
177
178

def test_compressed_tensors_w4a16_marlin24(vllm_runner):
    model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
    with vllm_runner(model_path) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24)
        assert qkv_proj.weight_packed.dtype is torch.int32

179
        output = llm.generate_greedy("Hello my name is", max_tokens=20)
180
        assert output
181
182
183
184
185
186
187
188
189
190
191


def test_compressed_tensors_fp8(vllm_runner):
    model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
    with vllm_runner(model_path) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
192
193
194
195
        assert isinstance(
            qkv_proj.scheme,
            (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8))

196
        assert qkv_proj.input_scale.dtype is torch.float32
197
198
199
200
201
202

        if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8):
            assert len(qkv_proj.input_scale.shape) == 0
            assert qkv_proj.weight.dtype is torch.float8_e4m3fn
            assert qkv_proj.weight_scale.dtype is torch.float32
            assert len(qkv_proj.weight_scale.shape) == 0
203

204
        output = llm.generate_greedy("Hello my name is", max_tokens=20)
205
        assert output
206
207
208
209
210
211


def test_compressed_tensors_kv_cache(vllm_runner):
    model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
        output = llm.generate_greedy("Hello world!", max_tokens=20)
212
        assert output
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307


@pytest.mark.skipif(not current_platform.has_device_capability(90),
                    reason="Sparse FP8 is not yet supported on this GPU type.")
def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy):
    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
    assert isinstance(qkv_proj.scheme, CompressedTensors24)

    assert qkv_proj.scheme.weight_quant.strategy == weight_strategy
    assert qkv_proj.scheme.input_quant.strategy == input_strategy
    assert qkv_proj.scheme.quantized
    assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
    sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
    assert sparsity_map.get("Linear").format == "dense"
    assert sparsity_map.get("Linear").sparsity_structure == "2:4"


@pytest.mark.skipif(not current_platform.has_device_capability(90),
                    reason="Sparse FP8 is not yet supported on this GPU type.")
@pytest.mark.parametrize("args_2of4", [
    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", "channel",
     "token"),
    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
     "channel", "tensor"),
    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", "tensor",
     "tensor"),
    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
     "tensor", "token"),
])
def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
    model, weight_strategy, input_strategy = args_2of4
    with vllm_runner(model) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj
        assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn
        _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        print(output)
        assert output


@pytest.mark.skipif(not current_platform.has_device_capability(90),
                    reason="Sparse FP8 is not yet supported on this GPU type.")
@pytest.mark.parametrize("args_2of4", [
    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
     "channel", "token"),
    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", "tensor",
     "tensor"),
    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
     "tensor", "token"),
])
def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
    model, weight_strategy, input_strategy = args_2of4
    with vllm_runner(model) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj
        assert qkv_proj.scheme.weights_dtype == torch.int8
        _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        print(output)
        assert output


@pytest.mark.skipif(not current_platform.has_device_capability(90),
                    reason="Sparse FP8 is not yet supported on this GPU type.")
@pytest.mark.parametrize(
    "args_2of4",
    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])
def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
    model = args_2of4
    with vllm_runner(model) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj
        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensors24)

        assert qkv_proj.scheme.weight_quant is None
        assert qkv_proj.scheme.input_quant is None
        assert not qkv_proj.scheme.quantized
        assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
        sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
        assert sparsity_map.get("Linear").format == "dense"
        assert sparsity_map.get("Linear").sparsity_structure == "2:4"

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        print(output)
        assert output