test_accuracy.py 3.62 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7
8
9
10
11
"""
This file tests the accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real-world usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""

zhuwenwen's avatar
zhuwenwen committed
12
import os
13
14
15
16
import lm_eval
import pytest

from vllm.platforms import current_platform
zhuwenwen's avatar
zhuwenwen committed
17
from ...utils import models_path_prefix
18

19
# Model locations, resolved once against the shared models directory prefix
# and reused by every table below.
_QWEN3_1_7B = os.path.join(models_path_prefix, "Qwen/Qwen3-1.7B")
_GEMMA3_1B_IT = os.path.join(models_path_prefix, "google/gemma-3-1b-it")

# Models exercised by the default accuracy test.
MODEL_NAMES = [_QWEN3_1_7B, _GEMMA3_1B_IT]

# Models exercised by the FP8 KV-cache accuracy test.
FP8_KV_MODEL_NAMES = [_QWEN3_1_7B]

# NOTE(review): not referenced by any code visible in this file.
NUM_CONCURRENT = 500
# lm_eval task whose results are inspected.
TASK = "gsm8k"
# Metric key read from the lm_eval results of TASK.
FILTER = "exact_match,strict-match"
# Tolerance allowed between the measured and the expected metric value.
RTOL = 0.03

# Expected FILTER metric value per model, keyed by the resolved model path.
EXPECTED_VALUES = {
    _QWEN3_1_7B: 0.68,
    _GEMMA3_1B_IT: 0.25,
}
34
35


36
def run_test(model_name, more_args=None):
    """Run the end-to-end accuracy test for a single model.

    Evaluates ``model_name`` on ``TASK`` through ``lm_eval.simple_evaluate``
    with the ``vllm`` model backend, then asserts that the ``FILTER`` metric
    differs from the value recorded in ``EXPECTED_VALUES`` by less than
    ``RTOL`` (compared as an absolute difference).

    Args:
        model_name: Model identifier passed to lm_eval as ``pretrained=...``.
            Must be a key of ``EXPECTED_VALUES``.
        more_args: Optional comma-separated ``key=value`` string appended to
            the model arguments; ``None`` appends nothing.

    Raises:
        AssertionError: If ``model_name`` has no expected value, or if the
            measured value is not within the tolerance.
    """
    # Fail fast: there is no point in running the (expensive) evaluation
    # when there is nothing to compare its result against.
    assert model_name in EXPECTED_VALUES, (
        f"Cannot find the expected value for the model {model_name=}")
    expected_value = EXPECTED_VALUES[model_name]

    model_args = f"pretrained={model_name},max_model_len=4096"
    if more_args is not None:
        model_args = f"{model_args},{more_args}"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        # Use the shared constant so the evaluated task always matches the
        # key used to read the results back below.
        tasks=TASK,
        batch_size="auto",
    )

    measured_value = results["results"][TASK][FILTER]
    assert abs(measured_value - expected_value) < RTOL, (
        f"Expected: {expected_value} |  Measured: {measured_value}")
58
59


60
61
62
63
# TODO: [AlexM] Fix it with new CI/CD tests
# Optional extra engine argument for the TPU runs: when non-empty, the tests
# below append it (comma-separated) to their TPU-specific arguments.
# It is currently empty, so no tensor-parallel argument is added.
TPU_TP_TEST_STR = ""  # e.g. "tensor_parallel_size=4"


64
65
66
@pytest.mark.skipif(
    not (current_platform.is_cuda() or current_platform.is_tpu()),
    reason="V1 is currently only supported on CUDA and TPU")
@pytest.mark.parametrize("model", MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
    """Check the accuracy of ``model`` with the V1 engine enabled.

    Sets ``VLLM_USE_V1=1`` for the duration of the test and delegates to
    ``run_test``. On TPU, extra model arguments are passed and xet
    downloads are disabled; on other platforms no extra arguments are used.
    """
    with monkeypatch.context() as env:
        env.setenv("VLLM_USE_V1", "1")

        extra_args = None
        if current_platform.is_tpu():
            # xet doesn't work well for both Qwen/Qwen3-1.7B and
            # google/gemma-3-1b-it
            env.setenv("HF_HUB_DISABLE_XET", "1")

            # Limit compilation time for TPU V1
            # NOTE(review): run_test already passes max_model_len=4096, so
            # the final argument string contains max_model_len twice;
            # presumably the later value takes effect - confirm.
            tpu_args = ["max_model_len=2048,max_num_seqs=64"]

            # Add TP test (if provided)
            if TPU_TP_TEST_STR:
                tpu_args.append(TPU_TP_TEST_STR)

            extra_args = ",".join(tpu_args)

        run_test(model, extra_args)
88
89


90
91
92
93
94
95
96
@pytest.mark.skipif(
    not (current_platform.is_cuda() or current_platform.is_tpu()),
    reason="V1 is currently only supported on CUDA and TPU")
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
        model, monkeypatch: pytest.MonkeyPatch):
    """Check the accuracy of ``model`` with the V1 engine and FP8 KV cache.

    Sets ``VLLM_USE_V1=1`` for the duration of the test and delegates to
    ``run_test``. On TPU, extra model arguments including
    ``kv_cache_dtype=fp8`` are passed and xet downloads are disabled.
    """
    with monkeypatch.context() as env:
        env.setenv("VLLM_USE_V1", "1")

        # NOTE(review): kv_cache_dtype=fp8 is only passed on TPU; on any
        # other platform run_test receives no extra arguments, so the FP8
        # KV cache is not requested there - confirm this is intended.
        extra_args = None
        if current_platform.is_tpu():
            # xet doesn't work well for Qwen/Qwen3-1.7B
            env.setenv("HF_HUB_DISABLE_XET", "1")

            # Limit compilation time for TPU V1
            tpu_args = [
                "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
            ]

            # Add TP test (if provided)
            if TPU_TP_TEST_STR:
                tpu_args.append(TPU_TP_TEST_STR)

            extra_args = ",".join(tpu_args)

        run_test(model, extra_args)