# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This file test accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real work usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""

import lm_eval
import pytest

from vllm.platforms import current_platform

# Models exercised by the default V1 engine accuracy test.
MODEL_NAMES = [
    "Qwen/Qwen3-1.7B",
    "google/gemma-3-1b-it",
]

# Models exercised by the fp8 KV-cache accuracy test.
FP8_KV_MODEL_NAMES = [
    "Qwen/Qwen3-1.7B",
]

# NOTE(review): NUM_CONCURRENT is not referenced in the visible part of
# this file; kept in case other code imports it.
NUM_CONCURRENT = 500
# lm_eval task name and the metric key read from its results.
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
# Tolerance; run_test applies it as an absolute +/- band around the
# measured score.
RTOL = 0.03
# Expected score per model; every model passed to run_test needs an entry.
EXPECTED_VALUES = {
    "Qwen/Qwen3-1.7B": 0.68,
    "google/gemma-3-1b-it": 0.25,
}


def run_test(model_name, more_args=None):
    """Run the end-to-end accuracy check for a single model.

    Evaluates ``model_name`` on ``TASK`` through lm_eval's ``vllm`` model
    and compares the ``FILTER`` metric with ``EXPECTED_VALUES``.

    Args:
        model_name: Model identifier passed to lm_eval as ``pretrained=``;
            it must also be a key of ``EXPECTED_VALUES``.
        more_args: Optional comma-separated ``key=value`` string appended
            to the lm_eval ``model_args``.

    Raises:
        AssertionError: If ``model_name`` has no entry in
            ``EXPECTED_VALUES``, or if the measured score is not strictly
            within ``RTOL`` of the expected score.
    """
    # Fail fast: verify a baseline exists before the costly evaluation.
    assert model_name in EXPECTED_VALUES, (
        f"Cannot find the expected value for the model {model_name=}"
    )
    expected_value = EXPECTED_VALUES[model_name]

    model_args = f"pretrained={model_name},max_model_len=4096"
    if more_args is not None:
        # NOTE(review): more_args may repeat max_model_len; presumably the
        # later value wins when lm_eval parses the string -- confirm.
        model_args = f"{model_args},{more_args}"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        # Use TASK so the evaluated task always matches the lookup below.
        tasks=TASK,
        batch_size="auto",
    )
    measured_value = results["results"][TASK][FILTER]

    # RTOL is applied as an absolute band around the measured value.
    assert measured_value - RTOL < expected_value < measured_value + RTOL, (
        f"Expected: {expected_value} |  Measured: {measured_value}"
    )


# TODO: [AlexM] Fix it with new CI/CD tests
# Extra lm_eval model argument appended to the TPU test arguments when
# non-empty; the empty string leaves the TPU arguments unchanged.
TPU_TP_TEST_STR = ""  # "tensor_parallel_size=4"


@pytest.mark.skipif(
    not (current_platform.is_cuda() or current_platform.is_tpu()),
    reason="V1 is currently only supported on CUDA and TPU",
)
@pytest.mark.parametrize("model", MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
    """Check accuracy of each model in MODEL_NAMES with VLLM_USE_V1=1."""
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_USE_V1", "1")

        extra_args = None
        if current_platform.is_tpu():
            # Smaller limits keep TPU V1 compilation time down.
            tpu_args = ["max_model_len=2048", "max_num_seqs=64"]
            # Optional tensor-parallel argument, only when configured.
            if TPU_TP_TEST_STR:
                tpu_args.append(TPU_TP_TEST_STR)
            extra_args = ",".join(tpu_args)

        run_test(model, extra_args)


@pytest.mark.skipif(
    not (current_platform.is_cuda() or current_platform.is_tpu()),
    reason="V1 is currently only supported on CUDA and TPU",
)
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
    model, monkeypatch: pytest.MonkeyPatch
):
    """Check accuracy of each model in FP8_KV_MODEL_NAMES with VLLM_USE_V1=1.

    NOTE(review): ``kv_cache_dtype=fp8`` is only added in the TPU branch;
    on other platforms no extra arguments are passed to run_test, so the
    fp8 KV cache is not requested there -- confirm this is intended.
    """
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_USE_V1", "1")

        extra_args = None
        if current_platform.is_tpu():
            # Smaller limits keep TPU V1 compilation time down.
            tpu_args = [
                "max_model_len=2048",
                "max_num_seqs=128",
                "kv_cache_dtype=fp8",
            ]
            # Optional tensor-parallel argument, only when configured.
            if TPU_TP_TEST_STR:
                tpu_args.append(TPU_TP_TEST_STR)
            extra_args = ",".join(tpu_args)

        run_test(model, extra_args)