test_shutdown.py 3.17 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
6
7
8
import signal
import subprocess
import sys
import time

9
10
11
import openai
import pytest

12
from vllm.platforms import current_platform
13
from vllm.utils.network_utils import get_open_port
14

15
# Model passed to the server via --model and named in every completion request.
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
16

17
18
19
20
21
# GPU initialization might take longer, so the timeouts below are generous.
# True when running on ROCm; used to disable stdout/stderr pipe capture.
_IS_ROCM = current_platform.is_rocm()
# Seconds to keep polling the server before declaring startup failed.
_SERVER_STARTUP_TIMEOUT = 120
# Seconds to wait for the server process to exit after it is terminated.
_PROCESS_EXIT_TIMEOUT = 15

22
23

@pytest.mark.asyncio
async def test_shutdown_on_engine_failure():
    """Verify that API calls fail once the server process has been terminated.

    Starts a vLLM OpenAI-compatible server in a subprocess, polls it until a
    completion request succeeds, terminates it to simulate a crash, and then
    checks that a subsequent request raises an OpenAI client error.

    The server process is always stopped and its pipes closed on exit, even
    when an assertion fails part-way through, so a failing run does not leave
    a server behind.
    """
    # Function-scope import: only needed here for non-blocking sleeps.
    import asyncio

    port = get_open_port()

    proc = subprocess.Popen(
        [
            # dtype, max-len etc set so that this can run in CI
            sys.executable,
            "-m",
            "vllm.entrypoints.openai.api_server",
            "--model",
            MODEL_NAME,
            "--dtype",
            "bfloat16",
            "--max-model-len",
            "128",
            "--enforce-eager",
            "--port",
            str(port),
            "--gpu-memory-utilization",
            "0.05",
            "--max-num-seqs",
            "2",
            "--disable-frontend-multiprocessing",
        ],
        # ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
        # stdout/stderr pipes are enabled during ROCm GPU initialization.
        # NOTE(review): on other platforms the pipes are not drained while the
        # server runs; very verbose server output could fill the pipe buffer.
        stdout=None if _IS_ROCM else subprocess.PIPE,
        stderr=None if _IS_ROCM else subprocess.PIPE,
        text=None if _IS_ROCM else True,
        # Make the child ignore SIGINT.
        preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
    )

    try:
        client = openai.AsyncOpenAI(
            base_url=f"http://localhost:{port}/v1",
            api_key="dummy",
            max_retries=0,
            timeout=10,
        )

        # Poll until the server answers a request. A monotonic clock keeps the
        # deadline immune to wall-clock adjustments.
        deadline = time.monotonic() + _SERVER_STARTUP_TIMEOUT
        while time.monotonic() < deadline:
            try:
                await client.completions.create(
                    model=MODEL_NAME, prompt="Hello", max_tokens=1
                )
                break
            except Exception:
                # Any failure here means "not ready yet"; back off without
                # blocking the event loop, then check the process is alive.
                await asyncio.sleep(0.5)
                if proc.poll() is not None:
                    if _IS_ROCM:
                        pytest.fail(
                            f"Server died during startup: {proc.returncode}"
                        )
                    else:
                        stdout, stderr = proc.communicate(timeout=1)
                        pytest.fail(
                            f"Server died during startup. "
                            f"stdout: {stdout}, stderr: {stderr}"
                        )
        else:
            # The loop ended without a successful request. The ``finally``
            # clause below stops the server, so the failure message is never
            # masked by a timeout while waiting for the process to exit.
            pytest.fail(
                f"Server failed to start in {_SERVER_STARTUP_TIMEOUT} seconds"
            )

        # Kill server to simulate crash
        proc.terminate()
        await asyncio.sleep(1)

        # Verify API calls now fail
        with pytest.raises((openai.APIConnectionError, openai.APIStatusError)):
            await client.completions.create(
                model=MODEL_NAME, prompt="This should fail", max_tokens=1
            )

        return_code = proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
        assert return_code is not None
    finally:
        # Always reap the server: ask politely first, force-kill if it does
        # not exit within the allowed time.
        if proc.poll() is None:
            proc.terminate()
            try:
                proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()
        # Release the pipe file descriptors (None when capture is disabled).
        for stream in (proc.stdout, proc.stderr):
            if stream is not None:
                stream.close()