# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

4
import dataclasses
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
9
10

import pytest

Alec's avatar
Alec committed
11
12
from tests.serve.common import (
    SERVE_TEST_DIR,
13
    WORKSPACE_DIR,
Alec's avatar
Alec committed
14
15
16
    params_with_model_mark,
    run_serve_deployment,
)
17
from tests.utils.constants import DefaultPort
18
from tests.utils.engine_process import EngineConfig
19
20
21
22
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
23
24
    embedding_payload,
    embedding_payload_default,
25
    metric_payload_default,
26
27
    responses_payload_default,
    responses_stream_payload_default,
28
)
29
30
31
32
33

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


@dataclass
class SGLangConfig(EngineConfig):
    """Configuration for SGLang test scenarios.

    A dataclass extending ``EngineConfig``. The only field declared here is
    ``stragglers``; every other field used by the configs in this module
    (``name``, ``directory``, ``script_name``, ``marks``, ...) comes from the
    base class.
    """

    # Defaults to the SGLang engine-core process name. Presumably used by the
    # test harness to find leftover processes -- confirm against EngineConfig.
    # A default_factory is used so each instance gets its own list.
    stragglers: list[str] = field(default_factory=lambda: ["SGLANG:EngineCore"])
38
39


40
# Directory containing the SGLang launch scripts. A non-empty SGLANG_DIR
# environment variable takes precedence; otherwise fall back to the in-repo
# examples directory under WORKSPACE_DIR.
sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/sglang"
)
43

44
45
46
# SGLang test configurations
# NOTE: pytest.mark.gpu_1 tests take ~167s (2m 47s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
#
# Maps a scenario name to its SGLangConfig. The keys are looked up by the
# `sglang_config_test` fixture via `request.param`, and each entry repeats its
# key in `name=`.
sglang_configs = {
    "aggregated": SGLangConfig(
        # Uses backend agg.sh (with metrics enabled) for testing standard
        # aggregated deployment with metrics collection
        name="aggregated",
        directory=sglang_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.timeout(240),  # 3x measured time (39s) + download time (120s)
        ],
        model="Qwen/Qwen3-0.6B",
        env={},
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            responses_payload_default(),
            responses_stream_payload_default(),
            metric_payload_default(min_num_requests=6, backend="sglang"),
        ],
    ),
    "disaggregated": SGLangConfig(
        name="disaggregated",
        directory=sglang_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        env={},
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_same_gpu": SGLangConfig(
        # Uses disagg_same_gpu.sh for single-GPU disaggregated testing
        # Validates metrics from both prefill (DefaultPort.SYSTEM1) and decode
        # (DefaultPort.SYSTEM2) workers
        name="disaggregated_same_gpu",
        directory=sglang_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.skip(reason="unstable"),
        ],
        model="Qwen/Qwen3-0.6B",
        env={},
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            # Validate dynamo_component_* and sglang:* metrics from prefill worker
            # (DefaultPort.SYSTEM1)
            metric_payload_default(
                min_num_requests=6,
                backend="sglang",
                port=DefaultPort.SYSTEM1.value,
            ),
            # Validate dynamo_component_* and sglang:* metrics from decode worker
            # (DefaultPort.SYSTEM2)
            metric_payload_default(
                min_num_requests=6,
                backend="sglang",
                port=DefaultPort.SYSTEM2.value,
            ),
        ],
    ),
    "kv_events": SGLangConfig(
        name="kv_events",
        directory=sglang_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(
                # Regex patterns for log lines expected from the KV router
                # components enabled via DYN_LOG above.
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
    ),
    "template_verification": SGLangConfig(
        # Tests custom jinja template preprocessing by verifying the template
        # marker 'CUSTOM_TEMPLATE_ACTIVE|' is applied to user messages.
        # The backend (launch/template_verifier.*) checks for this marker
        # and returns "Successfully Applied Chat Template" if found.
        # Uses SERVE_TEST_DIR (not sglang_dir) because template_verifier.sh/.py
        # are test-specific mock scripts in tests/serve/launch/
        name="template_verification",
        directory=SERVE_TEST_DIR,  # special directory for test-specific scripts
        script_name="template_verifier.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.nightly,
            pytest.mark.timeout(240),  # 3x measured time (20s) + download time (180s)
        ],
        model="Qwen/Qwen3-0.6B",
        env={},
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload_default(
                expected_response=["Successfully Applied Chat Template"]
            )
        ],
    ),
    "multimodal_epd_qwen": SGLangConfig(
        # E/PD architecture: Encode worker (GPU 0) + Prefill/Decode worker (GPU 1)
        name="multimodal_epd_qwen",
        directory=sglang_dir,
        script_name="multimodal_epd.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        timeout=360,
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                # NOTE: The response text may mention 'bus', 'train', 'streetcar', etc.
                # so we need something consistently found in the response, or a different
                # approach to validation for this test to be stable.
                expected_response=["image"],
                temperature=0.0,
            )
        ],
    ),
    "multimodal_agg_qwen": SGLangConfig(
        # Tests single-process aggregated multimodal inference using DecodeWorkerHandler
        # with in-process vision encoding (no separate encode worker)
        name="multimodal_agg_qwen",
        directory=sglang_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.nightly,
            # NOTE(review): this pytest timeout (300) is lower than the config
            # `timeout=360` below -- confirm that is intended.
            pytest.mark.timeout(300),
        ],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=[
            "--model-path",
            "Qwen/Qwen2.5-VL-7B-Instruct",
            "--chat-template",
            "qwen2-vl",
        ],
        delayed_start=0,
        timeout=360,
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["image"],
                temperature=0.0,
                max_tokens=100,
            )
        ],
    ),
    "embedding_agg": SGLangConfig(
        name="embedding_agg",
        directory=sglang_dir,
        script_name="agg_embed.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.nightly,
            pytest.mark.timeout(270),  # 3x measured time (29s) + download time (180s)
        ],
        model="Qwen/Qwen3-Embedding-4B",
        delayed_start=0,
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[
            # Test default payload with multiple inputs
            embedding_payload_default(
                repeat_count=2,
                expected_response=["Generated 2 embeddings with dimension"],
            ),
            # Test single string input
            embedding_payload(
                input_text="Hello, world!",
                repeat_count=1,
                expected_response=["Generated 1 embeddings with dimension"],
            ),
            # Test multiple string inputs
            embedding_payload(
                input_text=[
                    "The quick brown fox jumps over the lazy dog.",
                    "Machine learning is transforming technology.",
                    "Natural language processing enables computers to understand text.",
                ],
                repeat_count=1,
                expected_response=["Generated 3 embeddings with dimension"],
            ),
        ],
    ),
    "completions_only": SGLangConfig(
        name="completions_only",
        directory=sglang_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.post_merge,
            pytest.mark.timeout(
                420
            ),  # Total test timeout: 2x measured average (79.36s) + download time (240s) for 7B model
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=[
            "--model-path",
            "deepseek-ai/deepseek-llm-7b-base",
            "--dyn-endpoint-types",
            "completions",
        ],
        request_payloads=[
            completion_payload_default(),
        ],
    ),
}


Alec's avatar
Alec committed
294
@pytest.fixture(params=params_with_model_mark(sglang_configs))
def sglang_config_test(request):
    """Fixture that provides different SGLang test configurations.

    Parametrized with ``params_with_model_mark(sglang_configs)``; each
    ``request.param`` is used as a key into ``sglang_configs``.

    Returns:
        The ``SGLangConfig`` stored under ``request.param``. This is the
        shared module-level object, not a copy.
    """
    return sglang_configs[request.param]


@pytest.mark.e2e
@pytest.mark.sglang
# Use 2 system ports because some `sglang_configs` validate metrics on multiple ports.
# This test iterates over all configs via `sglang_config_test`.
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_sglang_deployment(
    sglang_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """Test SGLang deployment scenarios using common helpers.

    Runs once per entry of ``sglang_configs`` (through the
    ``sglang_config_test`` fixture). ``runtime_services_dynamic_ports`` and
    ``predownload_models`` are not referenced in the body; they are presumably
    requested only for their fixture setup.
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    # Work on a copy so the shared module-level config keeps its default
    # frontend port; only the copy gets the dynamically allocated one.
    config = dataclasses.replace(
        sglang_config_test, frontend_port=dynamo_dynamic_ports.frontend_port
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
321
322


323
324
@pytest.mark.e2e
@pytest.mark.sglang
325
@pytest.mark.gpu_2
326
@pytest.mark.nightly
327
328
329
@pytest.mark.skip(
    reason="Requires 4 GPUs - enable when hardware is consistently available"
)
330
331
332
def test_sglang_disagg_dp_attention(
    request, runtime_services_dynamic_ports, dynamo_dynamic_ports, predownload_models
):
333
334
    """Test sglang disaggregated with DP attention (requires 4 GPUs)"""

335
    # Kept for reference; this test uses a different launch path and is skipped