test_fp8.py 6.03 KB
Newer Older
1
2
3
4
5
6
"""Tests whether FP8 computation is enabled correctly.

Run `pytest tests/quantization/test_fp8.py --forked`.
"""
import pytest
import torch
7
import os
8

9
from tests.quantization.utils import is_quant_method_supported
10
from vllm import _custom_ops as ops
11
12
from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
                                                         Fp8LinearMethod)
13
from vllm.platforms import current_platform
14
from ..utils import models_path_prefix
zhuwenwen's avatar
zhuwenwen committed
15
from vllm.utils import is_hip
16

17
# FP8 checkpoints used by test_model_load_and_run; every name is joined onto
# `models_path_prefix`.
MODELS = [
    os.path.join(models_path_prefix, checkpoint) for checkpoint in (
        "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
        "nm-testing/Phi-3-mini-128k-instruct-FP8",
        "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
    )
]


zhuwenwen's avatar
zhuwenwen committed
24
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
                    reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", MODELS)
@pytest.mark.parametrize("force_marlin", [False, True])
def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
                            monkeypatch) -> None:
    """Smoke-test that an FP8 checkpoint loads and can generate.

    Args:
        vllm_runner: Fixture used as a context manager that yields the model
            wrapper.
        model_id: Path of the checkpoint to load (one entry of ``MODELS``).
        force_marlin: When true, ``VLLM_TEST_FORCE_FP8_MARLIN`` is set to
            ``"1"`` for the duration of the test.
        monkeypatch: pytest fixture used to set the environment variable.
    """
    if force_marlin:
        monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

    prompt = "Hello my name is"
    with vllm_runner(model_id) as llm:
        # This only checks that generation runs end to end; accuracy is
        # covered by the lm-eval tests.
        completions = llm.generate_greedy(prompts=[prompt], max_tokens=10)
        first_completion = completions[0]
        print(first_completion[1])


# Checkpoints used by test_kv_cache_model_load_and_run; every name is joined
# onto `models_path_prefix`.
KV_CACHE_MODELS = [
    os.path.join(models_path_prefix, checkpoint) for checkpoint in (
        # Deprecated AutoFP8 format using .kv_scale
        "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
        # AutoFP8 format using separate .k_scale and .v_scale
        "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
    )
]


zhuwenwen's avatar
zhuwenwen committed
49
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
                    reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
    """Check that an FP8 KV-cache checkpoint loads its scales and can run.

    Args:
        vllm_runner: Fixture used as a context manager that yields the model
            wrapper.
        model_id: Path of the checkpoint to load (one entry of
            ``KV_CACHE_MODELS``).
    """
    with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
        # Walk down to the attention module of the first decoder layer.
        engine = llm.model.llm_engine
        runner = engine.model_executor.driver_worker.model_runner
        first_layer_attn = runner.model.model.layers[0].self_attn.attn

        assert isinstance(first_layer_attn.quant_method, Fp8KVCacheMethod)
        # NOTE: a scale of 1.0 (the default value) would be valid in general,
        # but these particular checkpoints are known to have scales < 1.0.
        assert 0.0 < first_layer_attn._k_scale < 1.0
        assert 0.0 < first_layer_attn._v_scale < 1.0

        # This only checks that generation runs end to end; accuracy is
        # covered by the lm-eval tests.
        completions = llm.generate_greedy(prompts=["Hello my name is"],
                                          max_tokens=10)
        first_completion = completions[0]
        print(first_completion[1])

69

zhuwenwen's avatar
zhuwenwen committed
70
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
                    reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.parametrize("force_marlin", [False, True])
def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
                         monkeypatch) -> None:
    """Load facebook/opt-125m with ``quantization="fp8"`` and inspect layer 0.

    Args:
        vllm_runner: Fixture used as a context manager that yields the model
            wrapper.
        kv_cache_dtype: KV-cache dtype passed through to the runner.
        force_marlin: When true, ``VLLM_TEST_FORCE_FP8_MARLIN`` is set to
            ``"1"`` for the duration of the test.
        monkeypatch: pytest fixture used to set the environment variable.
    """
    if force_marlin:
        monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

    model_path = os.path.join(models_path_prefix, "facebook/opt-125m")
    with vllm_runner(model_path,
                     quantization="fp8",
                     kv_cache_dtype=kv_cache_dtype) as llm:
        engine = llm.model.llm_engine
        runner = engine.model_executor.driver_worker.model_runner
        model = runner.model

        fc1 = model.model.decoder.layers[0].fc1
        assert isinstance(fc1.quant_method, Fp8LinearMethod)

        if kv_cache_dtype == "fp8":
            attn = model.model.decoder.layers[0].self_attn.attn
            assert isinstance(attn.quant_method, Fp8KVCacheMethod)
            # Both scales are expected to equal 1.0 for this model.
            assert attn._k_scale == 1.0
            assert attn._v_scale == 1.0

        # With hardware support (and Marlin not forced) the weights stay in
        # fp8; otherwise the fp8 weights are packed for weight-only
        # quantization using Marlin kernels.
        keeps_fp8_weights = (current_platform.has_device_capability(89)
                             and not force_marlin)
        expected_weight_dtype = (torch.float8_e4m3fn
                                 if keeps_fp8_weights else torch.int32)
        assert fc1.weight.dtype == expected_weight_dtype
99
100


zhuwenwen's avatar
zhuwenwen committed
101
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
                    reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scaled_fp8_quant(dtype) -> None:
    """Compare ``ops.scaled_fp8_quant`` with a pure-PyTorch reference.

    The kernel is called three ways: without a scale (dynamic quantization),
    with the scale returned by the first call (static quantization), and
    with ``num_token_padding``. Every result is dequantized and compared to
    the dequantized dynamic-quantization output.

    Args:
        dtype: Floating-point dtype of the input tensor.
    """

    def quantize_ref(tensor, inv_scale):
        # Reference implementation meant to align fully with the kernel
        # being tested: scale in float32, clamp to the fp8 range, then cast.
        fp8_info = torch.finfo(torch.float8_e4m3fn)
        scaled = tensor.to(torch.float32) * inv_scale.reciprocal()
        clamped = torch.clamp(scaled, min=fp8_info.min, max=fp8_info.max)
        return clamped.to(torch.float8_e4m3fn)

    def per_tensor_dequantize(tensor, inv_scale, dtype):
        # Cast back to `dtype` and undo the per-tensor scaling.
        return tensor.to(dtype) * inv_scale

    # Note that we use a shape % 4 != 0 to cover edge cases,
    # because scaled_fp8_quant is vectorized by 4.
    x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)

    # Dynamic quantization: the kernel also returns the scale.
    kernel_q, inv_scale = ops.scaled_fp8_quant(x, None)
    expected = per_tensor_dequantize(kernel_q, inv_scale, dtype)

    # Reference dynamic quantization
    reference_q = quantize_ref(x, inv_scale)
    reference_dq = per_tensor_dequantize(reference_q, inv_scale, dtype)
    torch.testing.assert_close(expected, reference_dq)

    # Static quantization: reuse the scale from the dynamic call.
    static_q, _ = ops.scaled_fp8_quant(x, inv_scale)
    static_dq = per_tensor_dequantize(static_q, inv_scale, dtype)
    torch.testing.assert_close(expected, static_dq)

    # Padding: dim 0 of the output grows to 17; only the leading
    # x.shape[0] rows are compared.
    padded_q, _ = ops.scaled_fp8_quant(x, inv_scale, num_token_padding=17)
    assert padded_q.shape[0] == 17
    unpadded_q = padded_q.narrow(0, 0, x.shape[0])
    unpadded_dq = per_tensor_dequantize(unpadded_q, inv_scale, dtype)
    torch.testing.assert_close(expected, unpadded_dq)