test_basic.py 2.7 KB
Newer Older
1
from http import HTTPStatus
2
from typing import List
3
4
5

import openai
import pytest
6
import pytest_asyncio
7
8
9
10
11
12
13
14
15
import requests

from vllm.version import __version__ as VLLM_VERSION

from ...utils import RemoteOpenAIServer

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
@pytest.fixture(scope='module')
def server_args(request: pytest.FixtureRequest) -> List[str]:
    """ Provide extra arguments to the server via indirect parametrization

    Usage:

    >>> @pytest.mark.parametrize(
    >>>     "server_args",
    >>>     [
    >>>         ["--disable-frontend-multiprocessing"],
    >>>         [
    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
    >>>             "--enable-auto-tool-choice",
    >>>         ],
    >>>     ],
    >>>     indirect=True,
    >>> )
    >>> def test_foo(server, client):
    >>>     ...

    This will run `test_foo` twice with servers with:
    - `--disable-frontend-multiprocessing`
    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.

    """
    if not hasattr(request, "param"):
        return []

    val = request.param

    if isinstance(val, str):
        return [val]

    return request.param


52
@pytest.fixture(scope="module")
53
def server(server_args):
54
55
56
57
58
59
60
61
62
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
63
        *server_args,
64
65
66
67
68
69
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


70
71
72
73
@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client
74
75


76
77
78
79
80
81
82
83
84
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(["--disable-frontend-multiprocessing"],
                     id="disable-frontend-multiprocessing")
    ],
    indirect=True,
)
85
86
87
88
89
90
91
92
93
94
@pytest.mark.asyncio
async def test_show_version(client: openai.AsyncOpenAI):
    base_url = str(client.base_url)[:-3].strip("/")

    response = requests.get(base_url + "/version")
    response.raise_for_status()

    assert response.json() == {"version": VLLM_VERSION}


95
96
97
98
99
100
101
102
103
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param([], id="default-frontend-multiprocessing"),
        pytest.param(["--disable-frontend-multiprocessing"],
                     id="disable-frontend-multiprocessing")
    ],
    indirect=True,
)
104
105
106
107
108
109
110
@pytest.mark.asyncio
async def test_check_health(client: openai.AsyncOpenAI):
    base_url = str(client.base_url)[:-3].strip("/")

    response = requests.get(base_url + "/health")

    assert response.status_code == HTTPStatus.OK