test_accuracy.py 2.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
3
4
5
6
7
8
9
10
11
12
13
"""
This file test accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real work usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""

import lm_eval
import pytest

14
15
from vllm.platforms import current_platform

16
17
18
19
20
21
22
23
from ...utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUE = 0.58
24
DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests"]
25
MORE_ARGS_LIST = [
26
    [],  # Default
27
28
29
30
    ["--enable-chunked-prefill"],  # Chunked
    ["--num-scheduler-steps", "8"],  # MS
    ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"]  # MS+Stream
]
31
32
33
34
35
36
37
38
MAX_WAIT_SECONDS = None

if current_platform.is_tpu():
    MORE_ARGS_LIST = [
        [],  # Default
        # ["--num-scheduler-steps", "8"], # Multi-step << currently fails
    ]
    MAX_WAIT_SECONDS = 600
39
40


41
42
43
def run_test(more_args):
    """Run the end to end accuracy test."""

44
45
46
    args = list(DEFAULT_ARGS)
    args.extend(more_args)
    print(f"Running with: {args}")
47

48
49
50
    with RemoteOpenAIServer(
            MODEL_NAME, args,
            max_wait_seconds=MAX_WAIT_SECONDS) as remote_server:
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
        url = f"{remote_server.url_for('v1')}/completions"

        model_args = (
            f"model={MODEL_NAME},"
            f"base_url={url},"
            f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")

        results = lm_eval.simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=TASK,
        )

        measured_value = results["results"][TASK][FILTER]
        assert (measured_value - RTOL < EXPECTED_VALUE
                and measured_value + RTOL > EXPECTED_VALUE
                ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86


@pytest.mark.skipif(not current_platform.is_cuda(),
                    reason="V1 currently only supported on CUDA")
def test_lm_eval_accuracy_v1_engine(monkeypatch):
    """Run with the V1 Engine."""

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        run_test([])


@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
    """Run with the V0 Engine."""

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
        run_test(more_args)