# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Test Execution Times (Last Run: 2025-12-09):
- test_request_cancellation_sglang_aggregated: ~46s (gpu_1)
- test_request_cancellation_sglang_decode_cancel: ~60s (gpu_2, estimate)
- Total: 46.06s (0:00:46) for aggregated test only
"""

import logging
import os
import shutil
import time

import pytest

from tests.fault_tolerance.cancellation.utils import (
    DynamoFrontendProcess,
    poll_for_pattern,
    read_streaming_responses,
    send_cancellable_request,
    verify_frontend_cancellation_metrics,
    verify_runtime_cancellation_metrics,
)
from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
from tests.utils.managed_process import ManagedProcess
from tests.utils.payloads import check_health_generate, check_models_api
from tests.utils.port_utils import allocate_port, deallocate_port

logger = logging.getLogger(__name__)

# Marks applied to every test in this module. The indirect "request_plane"
# parametrization runs each test once per listed value ("nats" and "tcp") and
# hands the value to the fixture of the same name, which the worker process
# reads back via request.getfixturevalue("request_plane").
pytestmark = [
    pytest.mark.fault_tolerance,
    pytest.mark.sglang,
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
]


class DynamoWorkerProcess(ManagedProcess):
    """Process manager for Dynamo worker with SGLang backend"""

    def __init__(
        self,
        request,
        system_port: int,
        frontend_port: int,
        mode: str = "agg",
        bootstrap_port: int = 12345,
    ):
        """
        Initialize SGLang worker process.

        Args:
            request: pytest request object
            system_port: Port for system metrics server
            frontend_port: Port where frontend is running
            mode: One of "agg", "prefill", "decode"
            bootstrap_port: Value passed to --disaggregation-bootstrap-port
                when mode is "prefill" or "decode"; unused for "agg".
                Defaults to 12345, the value that used to be hard-coded.

        Raises:
            ValueError: If mode is not one of the supported values.
        """
        # Fail fast on a bad mode instead of launching a worker with an
        # argument it was never meant to receive and waiting for the
        # health-check timeout.
        if mode not in ("agg", "prefill", "decode"):
            raise ValueError(
                f"Unsupported worker mode {mode!r}; "
                "expected one of 'agg', 'prefill', 'decode'"
            )

        command = [
            "python3",
            "-m",
            "dynamo.sglang",
            "--model-path",
            FAULT_TOLERANCE_MODEL_NAME,
            "--served-model-name",
            FAULT_TOLERANCE_MODEL_NAME,
            "--page-size",
            "16",
            "--tp",
            "1",
            "--trust-remote-code",
        ]

        # Add mode-specific arguments
        if mode == "agg":
            # Aggregated mode - add skip-tokenizer-init like the serve test
            command.append("--skip-tokenizer-init")
        else:
            # Disaggregated mode - add disaggregation arguments like disagg.sh
            command.extend(
                [
                    "--disaggregation-mode",
                    mode,
                    "--disaggregation-bootstrap-port",
                    # TODO: callers still rely on the default; switch them to
                    # dynamic port allocation
                    str(bootstrap_port),
                    "--host",
                    "0.0.0.0",
                    "--disaggregation-transfer-backend",
                    "nixl",
                ]
            )

        # Configure health check based on worker type
        if mode in ["prefill", "decode"]:
            # Prefill and decode workers check their own status endpoint
            health_check_urls = [
                (f"http://localhost:{system_port}/health", self.is_ready)
            ]
        else:
            # Aggregated workers check both system status and frontend
            health_check_urls = [
                (f"http://localhost:{system_port}/health", self.is_ready),
                (f"http://localhost:{frontend_port}/v1/models", check_models_api),
                (f"http://localhost:{frontend_port}/health", check_health_generate),
            ]

        # Set environment variables
        env = os.environ.copy()
        env["DYN_REQUEST_PLANE"] = request.getfixturevalue("request_plane")

        env["DYN_LOG"] = "debug"
        # Disable canary health check - these tests expect full control over requests
        # sent to the workers where canary health check intermittently sends dummy
        # requests to workers interfering with the test process which may cause
        # intermittent failures
        env["DYN_HEALTH_CHECK_ENABLED"] = "false"
        env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
        env["DYN_SYSTEM_PORT"] = str(system_port)
        env["DYN_HTTP_PORT"] = str(frontend_port)

        # Set GPU assignment for disaggregated mode (like disagg.sh)
        if mode == "decode":
            env["CUDA_VISIBLE_DEVICES"] = "1"  # Use GPU 1 for decode worker
        elif mode == "prefill":
            env["CUDA_VISIBLE_DEVICES"] = "0"  # Use GPU 0 for prefill worker
        # For agg (aggregated) mode, use default GPU assignment

        # Set log directory based on worker type
        log_dir = f"{request.node.name}_{mode}_worker"

        # Clean up any existing log directory from previous runs
        try:
            shutil.rmtree(log_dir)
            logger.info(f"Cleaned up existing log directory: {log_dir}")
        except FileNotFoundError:
            # Directory doesn't exist, which is fine
            pass

        super().__init__(
            command=command,
            env=env,
            health_check_urls=health_check_urls,
            timeout=300,
            display_output=True,
            terminate_all_matching_process_names=False,
            # Ensure any orphaned SGLang engine cores or child helpers are cleaned up
            stragglers=[
                "SGLANG:EngineCore",
            ],
            straggler_commands=[
                "-m dynamo.sglang",
            ],
            log_dir=log_dir,
        )

        # Kept for is_ready() log messages and for __exit__ / metric checks.
        self.mode = mode
        self.system_port = system_port

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the worker, then release its allocated system port.

        The port is handed back only after the parent class has run its own
        exit handling, so it is not returned to the allocator while this
        worker may still be bound to it. The release is best-effort: a failure
        is logged and never masks the result or exception of the parent exit.
        """
        try:
            return super().__exit__(exc_type, exc_val, exc_tb)
        finally:
            try:
                # system_port is a required parameter, always set in __init__
                deallocate_port(self.system_port)
            except Exception as e:
                logger.warning(f"Failed to release SGLang worker port: {e}")

    def is_ready(self, response) -> bool:
        """Check the health of the worker process

        Args:
            response: HTTP response from the worker's /health endpoint; only
                its json() method is used.

        Returns:
            True when the JSON body has "status" equal to "ready"; False when
            the status differs or the body is not valid JSON.
        """
        try:
            data = response.json()
            if data.get("status") == "ready":
                logger.info(f"{self.mode.capitalize()} worker status is ready")
                return True
            logger.warning(
                f"{self.mode.capitalize()} worker status is not ready: {data.get('status')}"
            )
        except ValueError:
            logger.warning(
                f"{self.mode.capitalize()} worker health response is not valid JSON"
            )
        return False


@pytest.mark.timeout(160)  # 3x average
@pytest.mark.gpu_1
@pytest.mark.skip(reason="DYN-2265")
@pytest.mark.nightly
def test_request_cancellation_sglang_aggregated(
    request, runtime_services_dynamic_ports, predownload_models
):
    """
    End-to-end test for request cancellation functionality in aggregated mode.

    This test verifies that when a request is cancelled by the client,
    the system properly handles the cancellation and cleans up resources
    on the worker side in aggregated (agg) mode.

    Tests 3 cancellation scenarios:
    1. Completion request
    2. Chat completion request
    3. Chat completion request (streaming)

    Each scenario sends a request, waits for the worker log to show that the
    request reached SGLang, cancels it, then checks the worker and frontend
    logs and the cancellation metrics.

    Timing (Last Run: 2025-12-09): ~46s total
    - Engine initialization: ~14s
    - Testing 3 scenarios: ~30s (~10s each)
    - Teardown: ~2s

    TODO: Test is currently flaky/failing due to SGLang limitations with prefill cancellation.
    See: https://github.com/sgl-project/sglang/issues/11139
    """
    logger.info("Sanity check if latest test is getting executed")

    # Allocate ports to avoid conflicts with parallel tests
    # NOTE(review): the port is released only by DynamoWorkerProcess.__exit__,
    # so it stays allocated if the frontend or worker fails to start - confirm
    # whether allocate_port reservations expire on their own.
    system_port = allocate_port(9100)

    # Step 1: Start the frontend (allocates its own port)
    with DynamoFrontendProcess(request) as frontend:
        logger.info("Frontend started successfully")

        # Step 2: Start an aggregated worker
        with DynamoWorkerProcess(
            request,
            system_port=system_port,
            frontend_port=frontend.frontend_port,
            mode="agg",
        ) as worker:
            logger.info(f"Aggregated Worker PID: {worker.get_pid()}")
            # TODO: Why wait after worker ready fixes frontend 404 / 500 flakiness?
            time.sleep(2)

            # Step 3: Test request cancellation with polling approach
            # Offsets carry over between scenarios so each poll only sees log
            # output produced after the previous match.
            frontend_log_offset, worker_log_offset = 0, 0

            test_scenarios = [
                ("completion", "Completion request cancellation"),
                ("chat_completion", "Chat completion request cancellation"),
                (
                    "chat_completion_stream",
                    "Chat completion stream request cancellation",
                ),
            ]

            for idx, (request_type, description) in enumerate(test_scenarios):
                logger.info(f"Testing {description.lower()}...")

                # Send the request (non-blocking)
                cancellable_req = send_cancellable_request(
                    frontend.frontend_port, request_type
                )

                # Poll for "New Request ID" pattern (Dynamo context ID)
                request_id, worker_log_offset = poll_for_pattern(
                    process=worker,
                    pattern="New Request ID: ",
                    log_offset=worker_log_offset,
                    match_type="contains",
                )

                # For streaming, read one response first to trigger SGLang ID logging
                if request_type == "chat_completion_stream":
                    read_streaming_responses(cancellable_req, expected_count=1)

                # Wait for SGLang to actually start processing (get SGLang request ID)
                _, worker_log_offset = poll_for_pattern(
                    process=worker,
                    pattern="New SGLang Request ID: ",
                    log_offset=worker_log_offset,
                    match_type="contains",
                )

                # Now we know SGLang has the request, cancel it
                cancellable_req.cancel()
                logger.info(f"Cancelled request ID: {request_id}")

                # Poll for "Aborted Request ID" with matching ID
                _, worker_log_offset = poll_for_pattern(
                    process=worker,
                    pattern=f"Aborted Request ID: {request_id}",
                    log_offset=worker_log_offset,
                    max_wait_ms=2000,
                )

                # Verify frontend log has kill message
                _, frontend_log_offset = poll_for_pattern(
                    process=frontend,
                    pattern="issued control message Kill to sender",
                    log_offset=frontend_log_offset,
                )

                logger.info(f"{description} detected successfully")

                # Verify cancellation metrics after each scenario
                verify_frontend_cancellation_metrics(
                    frontend_port=frontend.frontend_port,
                    request_type=request_type,
                    expected_count=1,
                )
                # The same worker serves every scenario, so its expected
                # count grows by one per iteration.
                verify_runtime_cancellation_metrics(
                    worker_system_port=worker.system_port,
                    expected_count=idx + 1,
                )


@pytest.mark.timeout(300)  # 3x average
@pytest.mark.gpu_2
@pytest.mark.pre_merge
def test_request_cancellation_sglang_decode_cancel(
    request, runtime_services_dynamic_ports, predownload_models
):
    """
    End-to-end test for request cancellation during decode phase.

    This test verifies that when a request is cancelled by the client during the decode phase,
    the system properly handles the cancellation and cleans up resources
    on both the prefill and decode workers in a disaggregated setup.

    Note: This test requires 2 GPUs to run decode and prefill workers on separate GPUs.

    Timing (Last Run: 2025-12-09): ~60s total (estimated)
    - Engine initialization: ~20s (decode + prefill workers)
    - Testing stream cancellation during decode: ~38s
    - Teardown: ~2s
    """

    # Allocate ports to avoid conflicts with parallel tests
    # NOTE(review): each port is released only by the matching
    # DynamoWorkerProcess.__exit__, so it stays allocated if an earlier
    # process fails to start - confirm whether reservations expire on their own.
    decode_system_port = allocate_port(9100)
    prefill_system_port = allocate_port(9200)

    # Step 1: Start the frontend (allocates its own port)
    with DynamoFrontendProcess(request) as frontend:
        logger.info("Frontend started successfully")

        # Step 2: Start the decode worker
        with DynamoWorkerProcess(
            request,
            system_port=decode_system_port,
            frontend_port=frontend.frontend_port,
            mode="decode",
        ) as decode_worker:
            logger.info(f"Decode Worker PID: {decode_worker.get_pid()}")

            # Step 3: Start the prefill worker
            with DynamoWorkerProcess(
                request,
                system_port=prefill_system_port,
                frontend_port=frontend.frontend_port,
                mode="prefill",
            ) as prefill_worker:
                logger.info(f"Prefill Worker PID: {prefill_worker.get_pid()}")

                # TODO: Why wait after worker ready fixes frontend 404 / 500 flakiness?
                time.sleep(2)

                # Step 4: Test request cancellation during decode phase
                logger.info(
                    "Testing chat completion stream request cancellation during decode phase..."
                )

                # Send streaming request (non-blocking)
                cancellable_req = send_cancellable_request(
                    frontend.frontend_port, "chat_completion_stream"
                )

                # Poll for "New Request ID" pattern in decode worker (Dynamo context ID)
                request_id, decode_log_offset = poll_for_pattern(
                    process=decode_worker,
                    pattern="New Request ID: ",
                    match_type="contains",
                )

                # Verify same request ID reached prefill worker
                # (only the match matters; the returned offset is not needed)
                poll_for_pattern(
                    process=prefill_worker,
                    pattern=f"New Request ID: {request_id}",
                )

                # Read one response first to trigger SGLang ID logging in decode worker
                read_streaming_responses(cancellable_req, expected_count=1)

                # Wait for SGLang to start processing in decode worker
                _, decode_log_offset = poll_for_pattern(
                    process=decode_worker,
                    pattern="New SGLang Request ID: ",
                    log_offset=decode_log_offset,
                    match_type="contains",
                )

                # Now we know SGLang has the request in decode worker, cancel it
                cancellable_req.cancel()
                logger.info(f"Cancelled request ID: {request_id}")

                # Poll for "Aborted Request ID" in decode worker
                poll_for_pattern(
                    process=decode_worker,
                    pattern=f"Aborted Request ID: {request_id}",
                    log_offset=decode_log_offset,
                )

                # Verify frontend log has kill message
                poll_for_pattern(
                    process=frontend,
                    pattern="issued control message Kill to sender",
                )

                logger.info(
                    "Chat completion stream cancellation in decode phase detected successfully"
                )

                # Verify cancellation metrics
                verify_frontend_cancellation_metrics(
                    frontend_port=frontend.frontend_port,
                    request_type="chat_completion_stream",
                    expected_count=1,
                )
                verify_runtime_cancellation_metrics(
                    worker_system_port=decode_worker.system_port,
                    expected_count=1,
                )
                # The request was cancelled during decode, so the prefill
                # worker is expected to report no cancellations.
                verify_runtime_cancellation_metrics(
                    worker_system_port=prefill_worker.system_port,
                    expected_count=0,
                    component="prefill",
                )