test_compressed_tensors.py 19.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
"""Test model set-up and weight loading for llmcompressor-quantized models.

Run `pytest tests/quantization/test_compressed_tensors.py`.
"""
7

8
import pytest
9
import torch
10
from compressed_tensors.quantization import QuantizationType
11

12
from tests.models.utils import check_logprobs_close
13
from vllm.model_executor.layers.fused_moe import UnquantizedFusedMoEMethod
14
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
15
16
17
18
19
20
21
22
23
    CompressedTensorsLinearMethod,
    CompressedTensorsW4A4Fp4,
    CompressedTensorsW4A8Fp8,
    CompressedTensorsW4A16Fp4,
    CompressedTensorsW8A8Fp8,
    CompressedTensorsW8A8Int8,
    CompressedTensorsW8A16Fp8,
    CompressedTensorsWNA16,
)
24
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
25
from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp
26
from vllm.model_executor.layers.quantization.utils.nvfp4_utils import (
27
28
    cutlass_fp4_supported,
)
29
from vllm.platforms import current_platform
30
from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
31

32
33
34
35
36
# AITER only supports per-channel-per-channel INT8 GEMM
# and per-tensor-per-tensor INT8 GEMM.
# It does not support mixed-precision MM or mixed quantization schemes.
ROCM_AITER_SUPPORTED_INT8_MODEL = [
    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
]

40
# TritonInt8ScaledMMLinearKernel only supports symmetric quantization.
ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL = [
    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
]

49

50
@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
    """Allow insecure serialization for every test in this module.

    `LLM.apply_model` requires pickling a function, so the
    `VLLM_ALLOW_INSECURE_SERIALIZATION` environment variable is set to "1"
    through `monkeypatch` before each test runs.
    """
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
54
55


56
57
@pytest.mark.parametrize(
    "model_args",
    [
        (
            "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
            "tensor",
            QuantizationType.INT,
            2560,
            True,
        ),
        (
            "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
            "tensor",
            QuantizationType.INT,
            2560,
            False,
        ),
    ],
)
def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
    """Check set-up of W8A8 INT8 models with a static input scheme.

    Each `model_args` tuple is
    `(model_path, strategy, quant_type, shape_0, is_symmetric)`. The first
    decoder layer's projections are inspected for zero points, quantization
    method, scheme, weight dtype and scale shapes/dtypes, and then a short
    greedy generation is run.
    """
    # The quantization type entry is carried in the parameters but is not
    # asserted on by this test.
    model_path, strategy, _quant_type, shape_0, is_symmetric = model_args

    if (
        current_platform.is_rocm()
        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
    ):
        pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")

    with vllm_runner(model_path, enforce_eager=True) as llm:

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
            o_proj = layer.self_attn.o_proj
            gate_up_proj = layer.mlp.gate_up_proj
            down_proj = layer.mlp.down_proj

            # assert zp for symmetric and asymmetric cases
            def zp_valid(zp: torch.Tensor | None):
                # Symmetric: no zero point at all.
                if is_symmetric:
                    return zp is None

                # Asymmetric: an int32 zero point must be present.
                return zp is not None and zp.dtype is torch.int32

            assert zp_valid(qkv_proj.input_zero_point)
            assert zp_valid(o_proj.input_zero_point)
            assert zp_valid(gate_up_proj.input_zero_point)
            assert zp_valid(down_proj.input_zero_point)

            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)

            assert qkv_proj.scheme.strategy == strategy
            assert qkv_proj.scheme.is_static_input_scheme
            expected_type = torch.int8

            assert qkv_proj.weight.dtype is expected_type
            assert o_proj.weight.dtype is expected_type
            assert gate_up_proj.weight.dtype is expected_type

            if qkv_proj.scheme.strategy == "tensor":
                # Make sure it is a channelwise buffer
                # After running process_weights_after_loading
                assert len(qkv_proj.weight_scale.shape) == 2
                assert qkv_proj.weight_scale.shape[0] == shape_0
                assert qkv_proj.weight_scale.shape[1] == 1
            assert qkv_proj.weight_scale.dtype is torch.float32
            assert qkv_proj.input_scale.dtype is torch.float32

        llm.apply_model(check_model)

        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output

134

135
136
137
138
139
140
@pytest.mark.parametrize(
    "model_path",
    [
        "neuralmagic/Llama-3.2-1B-quantized.w8a8",
    ],
)
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.parametrize(
    "use_aiter", [True, False] if current_platform.is_rocm() else [False]
)
def test_compressed_tensors_w8a8_logprobs(
    hf_runner,
    vllm_runner,
    example_prompts,
    model_path,
    max_tokens,
    num_logprobs,
    use_aiter,
    monkeypatch,
):
    """Compare greedy logprobs of a W8A8 model between the HF and vLLM runners.

    The `use_aiter=True` variant is only parametrized on ROCm; it sets
    `VLLM_ROCM_USE_AITER` before the model is loaded. Outputs from both
    runners are compared with `check_logprobs_close`.
    """
    if (
        current_platform.is_rocm()
        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
    ):
        pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")

    if use_aiter:
        if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
            pytest.skip(f"Skip model {model_path} as it is not support by aiter.")
        # this will enable VLLM_ROCM_USE_AITER_LINEAR
        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

    dtype = "bfloat16"

    # skip language translation prompt for the static per tensor models
    # NOTE: neither model below is in the current `model_path`
    # parametrization, so this branch is inactive unless they are added.
    if model_path in (
        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
    ):
        example_prompts = example_prompts[0:-1]

    with hf_runner(model_path, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs
        )

    with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs
        )

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )

    if current_platform.is_rocm():
        torch.accelerator.synchronize()
196

197

198
def test_compressed_tensors_no_enforce_eager(vllm_runner):
    """Smoke-test generation without passing `enforce_eager` to the runner."""
    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
    with vllm_runner(model_path) as llm:
        # Pass the prompt as a one-element list of prompts.
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output


205
206
207
208
209
210
211
212
213
214
@pytest.mark.parametrize(
    "model_args",
    [
        ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
        (
            "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
            "channel",
        ),
    ],
)
@pytest.mark.parametrize(
    "use_aiter", [True, False] if current_platform.is_rocm() else [False]
)
def test_compressed_tensors_w8a8_dynamic_per_token(
    vllm_runner,
    model_args,
    use_aiter,
    monkeypatch,
):
    """Check set-up of W8A8 INT8 models with a non-static input scheme.

    Each `model_args` tuple is `(model_path, strategy)`. The first layer's
    `qkv_proj` must use `CompressedTensorsW8A8Int8` with the expected
    strategy, a non-static input scheme and int8 weights; a short greedy
    generation is run afterwards.
    """
    model_path, strategy = model_args

    if (
        current_platform.is_rocm()
        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
    ):
        pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")

    if use_aiter:
        if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
            pytest.skip(f"Skip model {model_path} as it is not support by aiter.")
        # this will enable VLLM_ROCM_USE_AITER_LINEAR
        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

    with vllm_runner(model_path, enforce_eager=True, dtype=torch.float16) as llm:

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj

            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
            assert not qkv_proj.scheme.is_static_input_scheme
            assert qkv_proj.scheme.strategy == strategy
            assert qkv_proj.weight.dtype is torch.int8

        llm.apply_model(check_model)

        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output

256

257
258
@pytest.mark.parametrize(
    "wNa16_args",
    [
        (
            "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
            "channel",
            None,
            8,
            True,
            False,
        ),
        (
            "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
            "group",
            128,
            8,
            False,
            True,
        ),
    ],
)
@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="The tests are skipped on non-CUDA platform."
)
def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
    """Check scheme attributes of WNA16 compressed-tensors models.

    Each `wNa16_args` tuple is
    `(model, strategy, group, pack_factor, symmetric, has_g_idx)`. A `group`
    of `None` is expected to surface as `group_size == -1` on the scheme.
    """
    model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
    with vllm_runner(model, enforce_eager=True) as llm:

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)

            assert qkv_proj.scheme.strategy == strategy
            assert qkv_proj.scheme.group_size == (-1 if group is None else group)

            assert qkv_proj.scheme.pack_factor == pack_factor
            assert qkv_proj.scheme.symmetric == symmetric
            assert qkv_proj.scheme.has_g_idx == has_g_idx

        llm.apply_model(check_model)

        # Pass the prompt as a one-element list of prompts.
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output

304

305
306
def test_compressed_tensors_fp8(vllm_runner):
    """Check set-up of an FP8 compressed-tensors model.

    The first layer's `qkv_proj` may resolve to either
    `CompressedTensorsW8A8Fp8` or `CompressedTensorsW8A16Fp8`; the weight and
    scale checks are only applied for the W8A8 scheme.
    """
    model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
    with vllm_runner(model_path, enforce_eager=True) as llm:

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj

            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(
                qkv_proj.scheme,
                (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8),
            )

            assert qkv_proj.input_scale.dtype is torch.float32

            if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8):
                # Zero-dimensional (scalar) input and weight scales.
                assert len(qkv_proj.input_scale.shape) == 0
                assert qkv_proj.weight.dtype is current_platform.fp8_dtype()
                assert qkv_proj.weight_scale.dtype is torch.float32
                assert len(qkv_proj.weight_scale.shape) == 0

        llm.apply_model(check_model)

        # Pass the prompt as a one-element list of prompts.
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output
332
333


334
335
336
@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
)
def test_compressed_tensors_kv_cache_fp8_per_tensor(vllm_runner):
    """Smoke-test generation with the `kvcache-fp8-tensor` checkpoint."""
    model_path = "nm-testing/TinyLlama-1.1B-Chat-v1.0-kvcache-fp8-tensor"
    with vllm_runner(model_path) as llm:
        # Pass the prompt as a one-element list of prompts.
        output = llm.generate_greedy(["Hello world!"], max_tokens=4)
        assert output


@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
)
def test_compressed_tensors_kv_cache_fp8_per_attn_head(vllm_runner):
    """Smoke-test generation with the `kvcache-fp8-attn_head` checkpoint.

    Skipped unless `get_flash_attn_version()` succeeds and reports a
    version of at least 3; the runner is pinned to the `FLASH_ATTN` backend.
    """
    model_path = "nm-testing/TinyLlama-1.1B-Chat-v1.0-kvcache-fp8-attn_head"
    try:
        fa_version = get_flash_attn_version()
    except Exception:
        # Deliberately broad: any failure to query the version is treated as
        # the FlashAttention backend being unavailable.
        pytest.skip("This test requires FlashAttention backend.")
    if fa_version is None or fa_version < 3:
        pytest.skip("This test requires FlashAttention version >= 3.")

    with vllm_runner(model_path, attention_config={"backend": "FLASH_ATTN"}) as llm:
        # Pass the prompt as a one-element list of prompts.
        output = llm.generate_greedy(["Hello world!"], max_tokens=4)
        assert output
359
360


361
362
363
@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
)
@pytest.mark.parametrize(
    "args",
    [
        # TODO: Enable once model is available again
        # ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4),
        ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4),
    ],
)
def test_compressed_tensors_nvfp4(vllm_runner, args):
    """Check the scheme selected for NVFP4 compressed-tensors models.

    Each `args` tuple is `(model, scheme)`. The first layer's `qkv_proj` must
    use the given scheme, or `CompressedTensorsW4A16Fp4` when
    `cutlass_fp4_supported()` is false, and must have a group size of 16.
    """
    model, scheme = args
    with vllm_runner(model, enforce_eager=True) as llm:

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)

            # Explicit parentheses: the parametrized scheme always passes;
            # W4A16 is accepted only when CUTLASS FP4 is unsupported.
            scheme_ok = isinstance(qkv_proj.scheme, scheme) or (
                isinstance(qkv_proj.scheme, CompressedTensorsW4A16Fp4)
                and not cutlass_fp4_supported()
            )
            if not scheme_ok:
                raise AssertionError("FP4 Scheme Mismatch")

            assert qkv_proj.scheme.group_size == 16

        llm.apply_model(check_model)
        # Pass the prompt as a one-element list of prompts.
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        print(output)
        assert output
396
397
398


@pytest.mark.skipif(
    not current_platform.is_cuda() or not current_platform.has_device_capability(90),
    reason="W4A8 FP8 is not yet supported on this GPU type.",
)
@pytest.mark.parametrize(
    "args",
    [("czhu-cohere/TinyLlama-1.1B-Chat-v1.0-W4A8-e2e", CompressedTensorsW4A8Fp8)],
)
def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
    """Check set-up of a W4A8 FP8 compressed-tensors model.

    Each `args` tuple is `(model, scheme)`. All four projections of the first
    layer must use the given scheme with int32 packed weights,
    `float8_e4m3fn` weight scales, float32 channel scales and group size 128.
    """
    model, scheme = args
    with vllm_runner(model, enforce_eager=True) as llm:

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
            o_proj = layer.self_attn.o_proj
            gate_up_proj = layer.mlp.gate_up_proj
            down_proj = layer.mlp.down_proj

            for proj in (qkv_proj, o_proj, gate_up_proj, down_proj):
                assert isinstance(proj.quant_method, CompressedTensorsLinearMethod)
                assert isinstance(proj.scheme, scheme)

                assert proj.weight_packed.dtype is torch.int32
                assert proj.weight_scale.dtype is torch.float8_e4m3fn
                assert proj.weight_chan_scale.dtype is torch.float32
                assert proj.scheme.group_size == 128

        llm.apply_model(check_model)
        # Pass the prompt as a one-element list of prompts.
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        print(output)
        assert output
431
432


433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
)
@pytest.mark.parametrize(
    "model,prompt,exp_perplexity",
    [
        (
            "nm-testing/Llama-3.2-1B-Instruct-spinquantR1R2R4-w4a16",
            "Flat is better than nested.\nSparse is better than dense.",
            150.0,
        ),
        (
            "nm-testing/Llama-3.2-1B-Instruct-quip-w4a16",
            "Flat is better than nested.\nSparse is better than dense.",
            150.0,
        ),
    ],
)
def test_compressed_tensors_transforms_perplexity(
    vllm_runner, model, prompt, exp_perplexity
):
    """Check that prompt perplexity stays at or below `exp_perplexity`.

    The perplexity of the single `prompt` is taken from
    `llm.generate_prompt_perplexity` and printed before the comparison.
    """
    with vllm_runner(model, enforce_eager=True) as llm:
        perplexity = llm.generate_prompt_perplexity([prompt])[0]
        print(perplexity)
        assert perplexity <= exp_perplexity
458
459
460
461


def test_compressed_tensors_fp8_block_enabled(vllm_runner):
    """Check set-up of a block-quantized FP8 compressed-tensors model.

    The first layer's `qkv_proj` must use `CompressedTensorsW8A8Fp8` with a
    `W8A8BlockFp8LinearOp`, 2-D FP8 weights, 2-D float32 weight scales, and a
    `QuantFP8` input quantization op dispatching to its CUDA or HIP forward.
    """
    model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
    with vllm_runner(model_path, enforce_eager=True) as llm:
        # Resolved here and captured by the closure below.
        fp8_dtype = current_platform.fp8_dtype()

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
            assert isinstance(
                qkv_proj.scheme.w8a8_block_fp8_linear, W8A8BlockFp8LinearOp
            )

            assert qkv_proj.weight.dtype is fp8_dtype
            assert qkv_proj.weight_scale.dtype is torch.float32
            assert len(qkv_proj.weight.shape) == 2
            assert len(qkv_proj.weight_scale.shape) == 2

            input_quant_op = qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
            assert isinstance(input_quant_op, QuantFP8)
            assert input_quant_op._forward_method in (
                input_quant_op.forward_cuda,
                input_quant_op.forward_hip,
            )

        llm.apply_model(check_model)

        # Pass the prompt as a one-element list of prompts.
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537


@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="This test is not for non-CUDA platforms",
)
def test_compressed_tensors_moe_ignore_with_model(vllm_runner):
    """
    Integration test for MoE layer ignore functionality with a real model.

    This test verifies that when loading a compressed-tensors quantized
    MoE model where some MoE layers are in the ignore list, those layers
    use UnquantizedFusedMoEMethod while non-ignored layers use the
    quantized method.

    Expected model structure:
    - Compressed-tensors quantized MoE model
    - Config with ignore list containing specific MoE layers
    - Multiple MoE layers where some are quantized and some are not
    """

    # Alternative checkpoint, kept for reference:
    # model_path = "nm-testing/tinysmokeqwen3moe-W4A16-first-only" # CT 12.3
    model_path = "nm-testing/tinysmokeqwen3moe-W4A16-first-only-CTstable"  # CT 12.2

    with vllm_runner(model_path, enforce_eager=True) as llm:

        def check_model(model):
            # Imported inside the function, as in the original test.
            from vllm.model_executor.layers.fused_moe import FusedMoE
            from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (  # noqa: E501
                CompressedTensorsMoEMethod,
            )

            # Check layer 0 MoE (should be quantized)
            layer_quantized = model.model.layers[0].mlp.experts
            assert isinstance(layer_quantized, FusedMoE)
            assert isinstance(layer_quantized.quant_method, CompressedTensorsMoEMethod)

            # Check layer 3 MoE (should be unquantized + ignored)
            layer_unquantized = model.model.layers[3].mlp.experts
            assert isinstance(layer_unquantized, FusedMoE)
            assert isinstance(layer_unquantized.quant_method, UnquantizedFusedMoEMethod)

        llm.apply_model(check_model)

        # Verify the model can generate output
        output = llm.generate_greedy(["Hello, my name is"], max_tokens=4)
        assert output
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560


def test_w4a16_moe_torch_compile(vllm_runner):
    """Regression test: MoE quant_config must be initialized inside the
    moe_forward custom op, not just in forward_native which is compiled by
    Dynamo (attribute mutations are not replayed at runtime).

    Without the fix in _moe_forward/_moe_forward_shared, this hits:
        AssertionError: Hidden size mismatch 2048 != 1024
    because use_int4_w4a16 is False (moe_quant_config stays None).
    """
    model_path = "nm-testing/tinysmokeqwen3moe-W4A16-first-only-CTstable"

    # enforce_eager=False is the point of the test; cudagraph mode is set
    # to "NONE" in the compilation config.
    with vllm_runner(
        model_path,
        enforce_eager=False,
        max_model_len=256,
        compilation_config={
            "cudagraph_mode": "NONE",
        },
    ) as llm:
        # Pass the prompt as a one-element list of prompts.
        output = llm.generate_greedy(["Hi"], max_tokens=1)
        assert output