test_compressed_tensors.py 13.3 KB
Newer Older
1
"""Test model set-up and weight loading for llmcompressor-quantized models.
2
3
4

Run `pytest tests/quantization/test_compressed_tensors.py`.
"""
5
from typing import Optional
6

7
import pytest
8
import torch
9
from compressed_tensors.quantization import QuantizationType
10

11
from tests.models.utils import check_logprobs_close
12
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
13
14
15
16
    CompressedTensors24, CompressedTensorsLinearMethod,
    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
    CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
    CompressedTensorsWNA16)
17
18
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    sparse_cutlass_supported)
19
from vllm.platforms import current_platform
20
21


22
23
24
25
26
27
28
29
@pytest.mark.parametrize(
    "model_args",
    [("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor",
      QuantizationType.INT, 2560, True),
     ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel",
      QuantizationType.INT, 2560, True),
     ("nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", "tensor",
      QuantizationType.INT, 2560, False)])
30
def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
31
    model_path, strategy, quant_type, shape_0, is_symmetric = model_args
32
    with vllm_runner(model_path, enforce_eager=True) as llm:
33
34
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]
35

36
37
38
39
        qkv_proj = layer.self_attn.qkv_proj
        o_proj = layer.self_attn.o_proj
        gate_up_proj = layer.mlp.gate_up_proj
        down_proj = layer.mlp.down_proj
40

41
42
43
44
45
46
47
48
49
50
51
52
        # assert zp for symmetric and asymmetric cases
        def zp_valid(zp: Optional[torch.Tensor]):
            if is_symmetric:
                return zp is None

            return zp is not None and zp.dtype is torch.int32

        assert zp_valid(qkv_proj.input_zero_point)
        assert zp_valid(o_proj.input_zero_point)
        assert zp_valid(gate_up_proj.input_zero_point)
        assert zp_valid(down_proj.input_zero_point)

53
54
55
56
57
58
        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(gate_up_proj.quant_method,
                          CompressedTensorsLinearMethod)
        assert isinstance(down_proj.quant_method,
                          CompressedTensorsLinearMethod)
59
        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
60

61
        assert qkv_proj.scheme.strategy == strategy
62
        assert qkv_proj.scheme.is_static_input_scheme
63
        expected_type = torch.int8
64
65
66
67

        assert qkv_proj.weight.dtype is expected_type
        assert o_proj.weight.dtype is expected_type
        assert gate_up_proj.weight.dtype is expected_type
68

69
        if qkv_proj.scheme.strategy == "tensor":
70
71
72
73
74
75
            # Make sure it is a channelwise buffer
            # After running process_weights_after_loading
            assert len(qkv_proj.weight_scale.shape) == 2
            assert qkv_proj.weight_scale.shape[0] == shape_0
            assert qkv_proj.weight_scale.shape[1] == 1
        assert qkv_proj.weight_scale.dtype is torch.float32
76
        assert qkv_proj.input_scale.dtype is torch.float32
77

78
        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
79
80
        assert output

81

82
83
84
85
86
87
@pytest.mark.parametrize("model_path", [
    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
])
88
89
90
91
92
93
94
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
                                          example_prompts, model_path,
                                          max_tokens, num_logprobs):
    dtype = "bfloat16"

95
96
97
98
    # skip language translation prompt for the static per tensor asym model
    if model_path == "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym":  # noqa: E501
        example_prompts = example_prompts[0:-1]

99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
    with hf_runner(model_path, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model_path, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


115
def test_compressed_tensors_no_enforce_eager(vllm_runner):
116
    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
117
    with vllm_runner(model_path) as llm:
118
        output = llm.generate_greedy("Hello my name is", max_tokens=20)
119
120
121
        assert output


122
123
@pytest.mark.parametrize("model_args", [
    ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
124
    ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
125
    ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"),
126
127
    ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
     "channel"),
128
])
129
def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
130
    model_path, strategy = model_args
131
    with vllm_runner(model_path, dtype=torch.float16) as llm:
132
133
134
135
136
137
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
138
        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
139
        assert not qkv_proj.scheme.is_static_input_scheme
140
        assert qkv_proj.scheme.strategy == strategy
141
        assert qkv_proj.weight.dtype is torch.int8
142

143
        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
144
145
        assert output

146

147
148
149
150
151
@pytest.mark.parametrize(
    "wNa16_args",
    [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8),
     ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8),
     ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)])
152
def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
153
    model, strategy, group, pack_factor = wNa16_args
154
155
156
157
158
159
    with vllm_runner(model) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj
        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
160
        assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)
161
162

        assert qkv_proj.scheme.strategy == strategy
163
        assert qkv_proj.scheme.group_size == (-1 if group is None else group)
164
165
166

        assert qkv_proj.weight_packed.dtype is torch.int32
        assert qkv_proj.weight_scale.dtype is torch.float16
167
        assert qkv_proj.scheme.pack_factor == pack_factor
168

169
170
171
        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        assert output

172
173
174
175
176
177
178
179
180
181
182
183
184

def test_compressed_tensors_w4a16_marlin24(vllm_runner):
    model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
    with vllm_runner(model_path) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24)
        assert qkv_proj.weight_packed.dtype is torch.int32

185
        output = llm.generate_greedy("Hello my name is", max_tokens=20)
186
        assert output
187
188
189
190
191
192
193
194
195
196
197


def test_compressed_tensors_fp8(vllm_runner):
    model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
    with vllm_runner(model_path) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
198
199
200
201
        assert isinstance(
            qkv_proj.scheme,
            (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8))

202
        assert qkv_proj.input_scale.dtype is torch.float32
203
204
205
206
207
208

        if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8):
            assert len(qkv_proj.input_scale.shape) == 0
            assert qkv_proj.weight.dtype is torch.float8_e4m3fn
            assert qkv_proj.weight_scale.dtype is torch.float32
            assert len(qkv_proj.weight_scale.shape) == 0
209

210
        output = llm.generate_greedy("Hello my name is", max_tokens=20)
211
        assert output
212
213
214
215
216
217


def test_compressed_tensors_kv_cache(vllm_runner):
    model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
        output = llm.generate_greedy("Hello world!", max_tokens=20)
218
        assert output
219
220


221
@pytest.mark.skipif(not sparse_cutlass_supported(),
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
                    reason="Sparse FP8 is not yet supported on this GPU type.")
def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy):
    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
    assert isinstance(qkv_proj.scheme, CompressedTensors24)

    assert qkv_proj.scheme.weight_quant.strategy == weight_strategy
    assert qkv_proj.scheme.input_quant.strategy == input_strategy
    assert qkv_proj.scheme.quantized
    assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
    sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
    assert sparsity_map.get("Linear").format == "dense"
    assert sparsity_map.get("Linear").sparsity_structure == "2:4"


@pytest.mark.skipif(not current_platform.has_device_capability(90),
                    reason="Sparse FP8 is not yet supported on this GPU type.")
@pytest.mark.parametrize("args_2of4", [
    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", "channel",
     "token"),
    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
     "channel", "tensor"),
    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", "tensor",
     "tensor"),
    ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
     "tensor", "token"),
])
def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
    model, weight_strategy, input_strategy = args_2of4
    with vllm_runner(model) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj
        assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn
        _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        print(output)
        assert output


263
@pytest.mark.skipif(not sparse_cutlass_supported(),
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
                    reason="Sparse FP8 is not yet supported on this GPU type.")
@pytest.mark.parametrize("args_2of4", [
    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
     "channel", "token"),
    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", "tensor",
     "tensor"),
    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
     "tensor", "token"),
])
def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
    model, weight_strategy, input_strategy = args_2of4
    with vllm_runner(model) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj
        assert qkv_proj.scheme.weights_dtype == torch.int8
        _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        print(output)
        assert output


288
@pytest.mark.skipif(not sparse_cutlass_supported(),
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
                    reason="Sparse FP8 is not yet supported on this GPU type.")
@pytest.mark.parametrize(
    "args_2of4",
    [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])
def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
    model = args_2of4
    with vllm_runner(model) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj
        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensors24)

        assert qkv_proj.scheme.weight_quant is None
        assert qkv_proj.scheme.input_quant is None
        assert not qkv_proj.scheme.quantized
        assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
        sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map  # noqa: E501
        assert sparsity_map.get("Linear").format == "dense"
        assert sparsity_map.get("Linear").sparsity_structure == "2:4"

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        print(output)
        assert output