# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

4
import dataclasses
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
9
10

import pytest

Alec's avatar
Alec committed
11
12
from tests.serve.common import (
    SERVE_TEST_DIR,
13
    WORKSPACE_DIR,
Alec's avatar
Alec committed
14
15
16
    params_with_model_mark,
    run_serve_deployment,
)
17
from tests.utils.constants import DefaultPort
18
from tests.utils.engine_process import EngineConfig
19
from tests.utils.payload_builder import (
20
21
    anthropic_messages_payload_default,
    anthropic_messages_stream_payload_default,
22
23
24
    chat_payload,
    chat_payload_default,
    completion_payload_default,
25
26
    embedding_payload,
    embedding_payload_default,
27
    metric_payload_default,
28
29
    responses_payload_default,
    responses_stream_payload_default,
30
)
31
32
33
34
35

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


@dataclasses.dataclass
class SGLangConfig(EngineConfig):
    """Engine configuration specialised for the SGLang serve test scenarios.

    Inherits every field from ``EngineConfig`` and only overrides the default
    for ``stragglers``.
    """

    # Each instance gets its own fresh list, so mutating one config's
    # stragglers never leaks into another.
    # NOTE(review): presumably names of leftover processes to clean up after a
    # run -- confirm against EngineConfig.
    stragglers: list[str] = dataclasses.field(
        default_factory=lambda: ["SGLANG:EngineCore"]
    )
40
41


42
# Directory holding the SGLang launch scripts. The SGLANG_DIR environment
# variable wins when set to a non-empty value; otherwise fall back to the
# in-repo examples directory under WORKSPACE_DIR.
sglang_dir = os.environ.get("SGLANG_DIR")
if not sglang_dir:
    sglang_dir = os.path.join(WORKSPACE_DIR, "examples/backends/sglang")
45

# SGLang test configurations
# NOTE: The pytest.mark.gpu_1 tests take ~167s (2m 47s) in total when run
# sequentially (with models pre-cached).
# TODO: Parallelize these tests to reduce total execution time.
# Model identifiers shared by several scenarios below.
_QWEN3_SMALL_MODEL = "Qwen/Qwen3-0.6B"
_QWEN_VL_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
_QWEN_EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-4B"
_DEEPSEEK_BASE_MODEL = "deepseek-ai/deepseek-llm-7b-base"

# Static frontend port placed in each config that sets one.
_FRONTEND_PORT = DefaultPort.FRONTEND.value

# Image used by the multimodal scenarios.
_COCO_TEST_IMAGE_URL = "http://images.cocodataset.org/test2017/000000155781.jpg"


def _image_question_content():
    """Return a fresh text + image_url message content list for multimodal chats."""
    return [
        {"type": "text", "text": "What is in this image?"},
        {
            "type": "image_url",
            "image_url": {"url": _COCO_TEST_IMAGE_URL},
        },
    ]


# Scenario name -> SGLangConfig. Each entry names a launch script, the pytest
# marks to attach, the model to serve, and the request payloads to send.
sglang_configs = {
    # Standard aggregated deployment launched through the backend agg.sh
    # (metrics enabled), including metrics collection.
    "aggregated": SGLangConfig(
        name="aggregated",
        directory=sglang_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            # 3x measured time (39s) + download time (120s)
            pytest.mark.timeout(240),
        ],
        model=_QWEN3_SMALL_MODEL,
        env={},
        frontend_port=_FRONTEND_PORT,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            responses_payload_default(),
            responses_stream_payload_default(),
            metric_payload_default(min_num_requests=6, backend="sglang"),
        ],
    ),
    "disaggregated": SGLangConfig(
        name="disaggregated",
        directory=sglang_dir,
        script_name="disagg.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.post_merge,
        ],
        model=_QWEN3_SMALL_MODEL,
        env={},
        frontend_port=_FRONTEND_PORT,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    # Single-GPU disaggregated deployment via disagg_same_gpu.sh. Metrics are
    # checked on both the prefill worker (DefaultPort.SYSTEM1) and the decode
    # worker (DefaultPort.SYSTEM2).
    "disaggregated_same_gpu": SGLangConfig(
        name="disaggregated_same_gpu",
        directory=sglang_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.skip(reason="unstable"),
        ],
        model=_QWEN3_SMALL_MODEL,
        env={},
        frontend_port=_FRONTEND_PORT,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            # Prefill worker (DefaultPort.SYSTEM1): validate the
            # dynamo_component_* and sglang:* metrics.
            metric_payload_default(
                min_num_requests=6,
                backend="sglang",
                port=DefaultPort.SYSTEM1.value,
            ),
            # Decode worker (DefaultPort.SYSTEM2): validate the
            # dynamo_component_* and sglang:* metrics.
            metric_payload_default(
                min_num_requests=6,
                backend="sglang",
                port=DefaultPort.SYSTEM2.value,
            ),
        ],
    ),
    "kv_events": SGLangConfig(
        name="kv_events",
        directory=sglang_dir,
        script_name="agg_router.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.post_merge,
        ],
        model=_QWEN3_SMALL_MODEL,
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
        frontend_port=_FRONTEND_PORT,
        request_payloads=[
            chat_payload_default(
                # Log patterns the chat request is expected to produce.
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
    ),
    # Custom jinja template preprocessing: the template marker
    # 'CUSTOM_TEMPLATE_ACTIVE|' must be applied to user messages. The backend
    # (launch/template_verifier.*) looks for that marker and answers
    # "Successfully Applied Chat Template" when it is present.
    # SERVE_TEST_DIR is used instead of sglang_dir because
    # template_verifier.sh/.py are test-specific mock scripts living in
    # tests/serve/launch/.
    "template_verification": SGLangConfig(
        name="template_verification",
        directory=SERVE_TEST_DIR,  # special directory for test-specific scripts
        script_name="template_verifier.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.nightly,
            # 3x measured time (20s) + download time (180s)
            pytest.mark.timeout(240),
        ],
        model=_QWEN3_SMALL_MODEL,
        env={},
        frontend_port=_FRONTEND_PORT,
        request_payloads=[
            chat_payload_default(
                expected_response=["Successfully Applied Chat Template"]
            )
        ],
    ),
    # E/PD architecture: Encode worker (GPU 0) + Prefill/Decode worker (GPU 1).
    "multimodal_epd_qwen": SGLangConfig(
        name="multimodal_epd_qwen",
        directory=sglang_dir,
        script_name="multimodal_epd.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.nightly,
        ],
        model=_QWEN_VL_MODEL,
        delayed_start=0,
        timeout=360,
        frontend_port=_FRONTEND_PORT,
        request_payloads=[
            chat_payload(
                _image_question_content(),
                repeat_count=1,
                # NOTE: The answer may mention 'bus', 'train', 'streetcar',
                # etc., so the check needs a word that shows up consistently
                # (or a different validation approach) to keep this stable.
                expected_response=["image"],
                temperature=0.0,
            )
        ],
    ),
    # Single-process aggregated multimodal inference using DecodeWorkerHandler
    # with in-process vision encoding (no separate encode worker).
    "multimodal_agg_qwen": SGLangConfig(
        name="multimodal_agg_qwen",
        directory=sglang_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.nightly,
            pytest.mark.timeout(300),
        ],
        model=_QWEN_VL_MODEL,
        script_args=[
            "--model-path",
            _QWEN_VL_MODEL,
            "--chat-template",
            "qwen2-vl",
        ],
        delayed_start=0,
        timeout=360,
        frontend_port=_FRONTEND_PORT,
        request_payloads=[
            chat_payload(
                _image_question_content(),
                repeat_count=1,
                expected_response=["image"],
                temperature=0.0,
                max_tokens=100,
            )
        ],
    ),
    "embedding_agg": SGLangConfig(
        name="embedding_agg",
        directory=sglang_dir,
        script_name="agg_embed.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.nightly,
            # 3x measured time (29s) + download time (180s)
            pytest.mark.timeout(270),
        ],
        model=_QWEN_EMBEDDING_MODEL,
        delayed_start=0,
        frontend_port=_FRONTEND_PORT,
        request_payloads=[
            # Default payload, which carries multiple inputs.
            embedding_payload_default(
                repeat_count=2,
                expected_response=["Generated 2 embeddings with dimension"],
            ),
            # A single string input.
            embedding_payload(
                input_text="Hello, world!",
                repeat_count=1,
                expected_response=["Generated 1 embeddings with dimension"],
            ),
            # A list of several string inputs.
            embedding_payload(
                input_text=[
                    "The quick brown fox jumps over the lazy dog.",
                    "Machine learning is transforming technology.",
                    "Natural language processing enables computers to understand text.",
                ],
                repeat_count=1,
                expected_response=["Generated 3 embeddings with dimension"],
            ),
        ],
    ),
    "completions_only": SGLangConfig(
        name="completions_only",
        directory=sglang_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.post_merge,
            # Total test timeout: 2x measured average (79.36s) + download
            # time (240s) for 7B model
            pytest.mark.timeout(420),
        ],
        model=_DEEPSEEK_BASE_MODEL,
        script_args=[
            "--model-path",
            _DEEPSEEK_BASE_MODEL,
            "--dyn-endpoint-types",
            "completions",
        ],
        request_payloads=[
            completion_payload_default(),
        ],
    ),
    "anthropic_messages": SGLangConfig(
        name="anthropic_messages",
        directory=sglang_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.post_merge,
            pytest.mark.timeout(240),
        ],
        model=_QWEN3_SMALL_MODEL,
        env={"DYN_ENABLE_ANTHROPIC_API": "1"},
        frontend_port=_FRONTEND_PORT,
        request_payloads=[
            anthropic_messages_payload_default(),
            anthropic_messages_stream_payload_default(),
        ],
    ),
}


Alec's avatar
Alec committed
313
@pytest.fixture(params=params_with_model_mark(sglang_configs))
def sglang_config_test(request):
    """Yield one entry of ``sglang_configs`` per fixture parameter.

    The parameter supplied by ``params_with_model_mark`` is used as the key
    into ``sglang_configs``; the matching config object is returned as-is.
    """
    scenario_key = request.param
    return sglang_configs[scenario_key]


@pytest.mark.e2e
@pytest.mark.sglang
# Two system ports are requested because some entries in `sglang_configs`
# validate metrics on more than one port, and this test walks every config
# through the `sglang_config_test` fixture.
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_sglang_deployment(
    sglang_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """Run one SGLang deployment scenario through the shared serve helper.

    The scenario's static frontend port is swapped for the dynamically
    allocated one before the config is handed to ``run_serve_deployment``.
    ``runtime_services_dynamic_ports`` and ``predownload_models`` are requested
    only as fixtures; their values are not used in the body.
    """
    has_enough_system_ports = num_system_ports >= 2
    assert (
        has_enough_system_ports
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    # Copy the shared config rather than mutating it, overriding only the
    # frontend port with the dynamically assigned value.
    port_override = {"frontend_port": dynamo_dynamic_ports.frontend_port}
    config = dataclasses.replace(sglang_config_test, **port_override)

    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
340
341


342
343
@pytest.mark.e2e
@pytest.mark.sglang
344
@pytest.mark.gpu_2
345
@pytest.mark.nightly
346
347
348
@pytest.mark.skip(
    reason="Requires 4 GPUs - enable when hardware is consistently available"
)
349
350
351
def test_sglang_disagg_dp_attention(
    request, runtime_services_dynamic_ports, dynamo_dynamic_ports, predownload_models
):
352
353
    """Test sglang disaggregated with DP attention (requires 4 GPUs)"""

354
    # Kept for reference; this test uses a different launch path and is skipped