fp_quant.py 974 Bytes
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test model set-up and inference for quantized HF models supported
on the GPU backend using FPQuant.

Each case loads a checkpoint and asserts on a short greedy completion.

Run `pytest tests/quantization/test_fp_quant.py`.
"""

import pytest

from tests.quantization.utils import is_quant_method_supported

# Checkpoints the test is parametrized over; the names indicate one NVFP4
# and one MXFP4 variant of the same base model.
MODELS = [
    "ISTA-DASLab/Qwen3-0.6B-RTN-NVFP4",
    "ISTA-DASLab/Qwen3-0.6B-RTN-MXFP4",
]
# NOTE(review): not referenced by any test visible in this file — confirm
# whether it should be forwarded to the runner or dropped.
DTYPE = ["bfloat16"]
# Values passed as `enforce_eager`, so both settings are exercised.
EAGER = [True, False]


@pytest.mark.skipif(
    not is_quant_method_supported("fp_quant"),
    reason="FPQuant is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("eager", EAGER)
def test_fpquant(vllm_runner, model, eager):
    """Greedy-decode a counting prompt with an FPQuant checkpoint.

    Runs once for every combination of ``MODELS`` and ``EAGER``; ``eager``
    is forwarded to the runner as ``enforce_eager``.  The test is skipped
    when ``is_quant_method_supported("fp_quant")`` is falsy.
    """
    prompts = ["1 2 3 4 5"]

    # The runner is only needed for generation, so the comparison happens
    # after the context manager has exited.
    with vllm_runner(model, enforce_eager=eager) as runner:
        results = runner.generate_greedy(prompts, max_tokens=2)

    # Element 1 of the first result is compared against the prompt
    # extended by " 6".
    first_result = results[0]
    generated_text = first_result[1]
    assert generated_text == "1 2 3 4 5 6"