"vllm/vscode:/vscode.git/clone" did not exist on "e4ae148a787df846beb194078c35655c44784bd5"
utils.py 7.07 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import threading
import time

import pytest
import requests

from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
12
13
14
from tests.utils.managed_process import (
    DynamoFrontendProcess as BaseDynamoFrontendProcess,
)
15
16
17
18
19
from tests.utils.managed_process import ManagedProcess

logger = logging.getLogger(__name__)


20
21
class DynamoFrontendProcess(BaseDynamoFrontendProcess):
    """Fault-tolerance frontend wrapper (keeps env settings from the historical helper).

    Configures the base frontend process with round-robin routing, an
    automatically allocated port (``frontend_port=0``) and the canary health
    check disabled.
    """

    def __init__(self, request) -> None:
        """Build the fault-tolerance environment and delegate to the base class.

        Args:
            request: Fixture request object. It must provide
                ``getfixturevalue`` and be able to resolve the
                ``request_plane`` fixture (presumably pytest's
                ``FixtureRequest`` -- confirm against callers).
        """
        extra_env = {
            "DYN_REQUEST_PLANE": request.getfixturevalue("request_plane"),
            # These tests expect full control over requests sent to workers. The canary
            # health check can inject extra requests and cause intermittent failures.
            "DYN_HEALTH_CHECK_ENABLED": "false",
        }
        super().__init__(
            request,
            frontend_port=0,  # allocate a free port (xdist-safe)
            router_mode="round-robin",
            extra_env=extra_env,
            terminate_existing=False,
        )

38
39

def start_completion_request(frontend_port: int) -> tuple[threading.Thread, list]:
    """
    Start a long-running completion request in a separate thread.

    The request is POSTed to the frontend's ``/v1/completions`` endpoint from a
    daemon thread, so the caller regains control while the request is still in
    flight.

    Args:
        frontend_port: Port where the frontend is running

    Returns:
        tuple: (request_thread, response_list). ``response_list`` receives the
        response object once the request returns. It stays empty if the
        request raises; in that case the error is only logged.
    """
    # No locking is needed: the request thread is the only writer.
    response_list = []

    def send_request():
        prompt = "Tell me a long long long story about yourself?"
        max_tokens = 8000
        timeout = 240  # Extended timeout for long request

        payload = {
            "model": FAULT_TOLERANCE_MODEL_NAME,
            "prompt": prompt,
            "max_tokens": max_tokens,
        }

        headers = {"Content-Type": "application/json"}

        logger.info(
            f"Sending completion request with prompt: '{prompt[:50]}...' and max_tokens: {max_tokens}"
        )

        try:
            response = requests.post(
                f"http://localhost:{frontend_port}/v1/completions",
                headers=headers,
                json=payload,
                timeout=timeout,
            )
            logger.info(f"Received response with status code: {response.status_code}")
            response_list.append(response)
        except Exception as e:
            # Deliberately swallowed: the caller detects the failure through
            # the empty response_list rather than through an exception raised
            # in this background thread.
            logger.error(f"Request failed with error: {e}")

    request_thread = threading.Thread(target=send_request, daemon=True)
    request_thread.start()

    return request_thread, response_list


def determine_request_receiving_worker(
    worker1: ManagedProcess, worker2: ManagedProcess, receiving_pattern: str
) -> tuple:
    """
    Identify which of two workers logged the request-receipt pattern.

    Each worker's log file is polled from its own daemon thread: the file is
    re-read every 5 ms, for at most 100 polls, until it contains
    ``receiving_pattern``. The calling test is failed through ``pytest.fail``
    unless exactly one worker matched.

    Args:
        worker1: First worker process
        worker2: Second worker process
        receiving_pattern: Log pattern indicating request receipt

    Returns:
        Tuple of (worker_with_request, name_of_worker_with_request)
    """
    poll_count = 500 // 5  # 500 ms budget at one poll every 5 ms
    poll_interval_s = 5 / 1000.0

    def watch_log(worker: ManagedProcess, hits: list[bool]) -> None:
        """Append True to ``hits`` once the worker's log contains the pattern."""
        for _ in range(poll_count):
            try:
                with open(worker.log_path, "r") as log_file:
                    if receiving_pattern in log_file.read():
                        hits.append(True)
                        return
            except Exception as e:
                # An unreadable log ends polling for this worker.
                logger.error(f"Could not read log file {worker.log_path}: {e}")
                return

            time.sleep(poll_interval_s)

    # One result list per worker; each is written only by its own poller.
    hits_by_worker = ([], [])
    pollers = [
        threading.Thread(target=watch_log, args=(worker, hits), daemon=True)
        for worker, hits in zip((worker1, worker2), hits_by_worker)
    ]
    for poller in pollers:
        poller.start()
    for poller in pollers:
        poller.join(timeout=1)

    worker1_received = bool(hits_by_worker[0])
    worker2_received = bool(hits_by_worker[1])

    # pytest.fail raises, so the ambiguous outcomes never reach the returns.
    if worker1_received and worker2_received:
        pytest.fail("Both workers received the request")
    if not (worker1_received or worker2_received):
        pytest.fail("Neither worker received the request")

    if worker1_received:
        logger.info("Request was received by Worker 1")
        return worker1, "Worker 1"

    logger.info("Request was received by Worker 2")
    return worker2, "Worker 2"


def validate_completion_response(
    request_thread: threading.Thread, response_list: list
) -> None:
    """
    Block until the completion request finishes, then check its response.

    The test is failed if the thread is still running after 240 seconds, if
    ``response_list`` does not hold exactly one response, or if the body is
    not JSON. The response must have status 200 and a non-empty ``text`` in
    its first entry under ``choices``.

    Args:
        request_thread: The thread running the completion request
        response_list: List containing the response from the request
    """
    request_thread.join(timeout=240)
    if request_thread.is_alive():
        pytest.fail("Request did not complete within 240 seconds")

    # Exactly one response is expected from the single request thread.
    response_count = len(response_list)
    if response_count != 1:
        pytest.fail(f"Received {response_count} responses, expected 1")
    response = response_list[0]

    assert (
        response.status_code == 200
    ), f"Request failed with status {response.status_code}: {response.text}"

    try:
        data = response.json()
    except ValueError:
        pytest.fail(f"Response is not valid JSON: {response.text}")

    # Walk the OpenAI completion structure one level at a time.
    assert "choices" in data, f"Response missing 'choices' field: {data}"
    choices = data["choices"]
    assert len(choices) > 0, f"Response has empty 'choices': {data}"
    first_choice = choices[0]
    assert "text" in first_choice, f"Response choice missing 'text' field: {data}"
    text = first_choice["text"]
    assert text, f"Response text is empty: {data}"

    logger.info(f"Received valid completion response: {text[:100]}...")
    logger.info("Request completed successfully")


def verify_migration_occurred(frontend_process: DynamoFrontendProcess) -> None:
    """
    Assert that the frontend log shows a stream being recreated.

    The log must contain the stream-disconnected message and must not contain
    the "Cannot recreate stream" error. An unreadable log file fails the test.

    Args:
        frontend_process: The frontend process to check logs for
    """
    reconnect_marker = "Stream disconnected... recreating stream..."
    failure_marker = "Cannot recreate stream: "

    log_path = frontend_process.log_path
    try:
        with open(log_path, "r") as log_file:
            log_content = log_file.read()
    except Exception as e:
        pytest.fail(f"Could not read frontend log file {log_path}: {e}")

    stream_recreated = reconnect_marker in log_content
    assert (
        stream_recreated
    ), "'Stream disconnected... recreating stream...' message not found in logs"

    recreate_failed = failure_marker in log_content
    assert not recreate_failed, "'Cannot recreate stream: ...' error found in logs"