test_sglang.py 9.96 KB
Newer Older
1
2
3
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

4
import dataclasses
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
9
10

import pytest

Alec's avatar
Alec committed
11
12
from tests.serve.common import (
    SERVE_TEST_DIR,
13
    WORKSPACE_DIR,
Alec's avatar
Alec committed
14
15
16
    params_with_model_mark,
    run_serve_deployment,
)
17
from tests.utils.constants import DefaultPort
18
from tests.utils.engine_process import EngineConfig
19
20
21
22
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
23
24
    embedding_payload,
    embedding_payload_default,
25
    metric_payload_default,
26
)
27
28
29
30
31

logger = logging.getLogger(__name__)


def _default_stragglers() -> list[str]:
    """Return a fresh default ``stragglers`` list for each ``SGLangConfig``."""
    return ["SGLANG:EngineCore"]


@dataclass
class SGLangConfig(EngineConfig):
    """Engine configuration specialised for the SGLang serve tests.

    Inherits every field from ``EngineConfig`` and only overrides the
    default value of ``stragglers``.
    """

    # NOTE(review): presumably names of leftover engine processes that the
    # test harness should look for during cleanup -- confirm against
    # EngineConfig, which defines how this field is consumed.
    stragglers: list[str] = field(default_factory=_default_stragglers)
36
37


38
# Directory holding the SGLang launch scripts. A non-empty SGLANG_DIR
# environment variable takes precedence; an unset or empty value falls back
# to the in-repo examples directory under WORKSPACE_DIR.
_default_sglang_dir = os.path.join(WORKSPACE_DIR, "examples/backends/sglang")
sglang_dir = os.getenv("SGLANG_DIR") or _default_sglang_dir
41

42
43
44
# SGLang test configurations
# NOTE: pytest.mark.gpu_1 tests take ~167s (2m 47s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
45
46
# Values shared by several of the scenarios below.
_QWEN3_SMALL_MODEL = "Qwen/Qwen3-0.6B"
_DEFAULT_FRONTEND_PORT = DefaultPort.FRONTEND.value

# Maps a scenario key to its SGLangConfig; each key matches the config's `name`.
sglang_configs = {
    "aggregated": SGLangConfig(
        # Standard aggregated deployment launched through the backend's
        # agg.sh (metrics enabled), so metrics collection is checked as well.
        name="aggregated",
        directory=sglang_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            # 3x measured time (39s) + download time (120s)
            pytest.mark.timeout(240),
        ],
        model=_QWEN3_SMALL_MODEL,
        env={},
        frontend_port=_DEFAULT_FRONTEND_PORT,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="sglang"),
        ],
    ),
    "disaggregated": SGLangConfig(
        name="disaggregated",
        directory=sglang_dir,
        script_name="disagg.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.post_merge,
        ],
        model=_QWEN3_SMALL_MODEL,
        env={},
        frontend_port=_DEFAULT_FRONTEND_PORT,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_same_gpu": SGLangConfig(
        # Single-GPU disaggregated run via disagg_same_gpu.sh. Metrics are
        # validated on both the prefill worker (DefaultPort.SYSTEM1) and the
        # decode worker (DefaultPort.SYSTEM2).
        name="disaggregated_same_gpu",
        directory=sglang_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.skip(reason="unstable"),
        ],
        model=_QWEN3_SMALL_MODEL,
        env={},
        frontend_port=_DEFAULT_FRONTEND_PORT,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            # Prefill worker (DefaultPort.SYSTEM1): check the
            # dynamo_component_* and sglang:* metrics.
            metric_payload_default(
                min_num_requests=6,
                backend="sglang",
                port=DefaultPort.SYSTEM1.value,
            ),
            # Decode worker (DefaultPort.SYSTEM2): check the
            # dynamo_component_* and sglang:* metrics.
            metric_payload_default(
                min_num_requests=6,
                backend="sglang",
                port=DefaultPort.SYSTEM2.value,
            ),
        ],
    ),
    "kv_events": SGLangConfig(
        name="kv_events",
        directory=sglang_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2],
        model=_QWEN3_SMALL_MODEL,
        env={
            # Raise log verbosity for the KV router publisher/scheduler so the
            # log lines matched by `expected_log` below are emitted.
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
        frontend_port=_DEFAULT_FRONTEND_PORT,
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
    ),
    "template_verification": SGLangConfig(
        # Exercises custom jinja template preprocessing: the template marker
        # 'CUSTOM_TEMPLATE_ACTIVE|' must be applied to user messages. The
        # backend (launch/template_verifier.*) looks for that marker and
        # answers "Successfully Applied Chat Template" when it is present.
        # The directory is SERVE_TEST_DIR rather than sglang_dir because
        # template_verifier.sh/.py are test-only mock scripts living in
        # tests/serve/launch/.
        name="template_verification",
        directory=SERVE_TEST_DIR,  # test-specific scripts live here
        script_name="template_verifier.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.nightly,
            # 3x measured time (20s) + download time (180s)
            pytest.mark.timeout(240),
        ],
        model=_QWEN3_SMALL_MODEL,
        env={},
        frontend_port=_DEFAULT_FRONTEND_PORT,
        request_payloads=[
            chat_payload_default(
                expected_response=["Successfully Applied Chat Template"]
            )
        ],
    ),
    "multimodal_agg_qwen": SGLangConfig(
        name="multimodal_agg_qwen",
        directory=sglang_dir,
        script_name="multimodal_agg.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.nightly,
        ],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        timeout=360,
        frontend_port=_DEFAULT_FRONTEND_PORT,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                # NOTE: The answer may say 'bus', 'train', 'streetcar', etc.,
                # so the check needs a word that reliably appears in the
                # response (or a different validation approach) to be stable.
                expected_response=["image"],
                temperature=0.0,
            )
        ],
    ),
    "embedding_agg": SGLangConfig(
        name="embedding_agg",
        directory=sglang_dir,
        script_name="agg_embed.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.nightly,
            # 3x measured time (29s) + download time (180s)
            pytest.mark.timeout(270),
        ],
        model="Qwen/Qwen3-Embedding-4B",
        delayed_start=0,
        frontend_port=_DEFAULT_FRONTEND_PORT,
        request_payloads=[
            # Default payload, which carries multiple inputs
            embedding_payload_default(
                repeat_count=2,
                expected_response=["Generated 2 embeddings with dimension"],
            ),
            # A single string as input
            embedding_payload(
                input_text="Hello, world!",
                repeat_count=1,
                expected_response=["Generated 1 embeddings with dimension"],
            ),
            # A list of strings as input
            embedding_payload(
                input_text=[
                    "The quick brown fox jumps over the lazy dog.",
                    "Machine learning is transforming technology.",
                    "Natural language processing enables computers to understand text.",
                ],
                repeat_count=1,
                expected_response=["Generated 3 embeddings with dimension"],
            ),
        ],
    ),
    "completions_only": SGLangConfig(
        name="completions_only",
        directory=sglang_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            # Total test timeout: 2x measured average (79.36s) + download
            # time (240s) for 7B model
            pytest.mark.timeout(420),
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=[
            "--model-path",
            "deepseek-ai/deepseek-llm-7b-base",
            "--dyn-endpoint-types",
            "completions",
        ],
        request_payloads=[
            completion_payload_default(),
        ],
    ),
}


Alec's avatar
Alec committed
248
@pytest.fixture(params=params_with_model_mark(sglang_configs))
def sglang_config_test(request):
    """Yield one entry of ``sglang_configs`` per fixture parameter.

    The fixture is parametrized with whatever ``params_with_model_mark``
    builds from ``sglang_configs``; each parameter value is used as the key
    to look up the matching configuration.
    """
    scenario_key = request.param
    return sglang_configs[scenario_key]


@pytest.mark.e2e
@pytest.mark.sglang
# Two system ports are requested because some entries in `sglang_configs`
# validate metrics on more than one port, and this test runs every entry
# through the `sglang_config_test` fixture.
@pytest.mark.parametrize("num_system_ports", [2], indirect=True)
def test_sglang_deployment(
    sglang_config_test,
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    num_system_ports,
    predownload_models,
):
    """Run one SGLang serve scenario through the shared deployment helper.

    ``runtime_services_dynamic_ports`` and ``predownload_models`` are
    requested as fixtures but never referenced in the body -- presumably
    they are needed only for their setup side effects.
    """
    assert (
        num_system_ports >= 2
    ), "serve tests require at least SYSTEM_PORT1 + SYSTEM_PORT2"

    # Work on a copy so the shared module-level config is left untouched;
    # only the frontend port is swapped for the dynamically allocated one.
    allocated_frontend_port = dynamo_dynamic_ports.frontend_port
    config = dataclasses.replace(
        sglang_config_test,
        frontend_port=allocated_frontend_port,
    )
    run_serve_deployment(config, request, ports=dynamo_dynamic_ports)
275
276


277
278
279
280
@pytest.mark.e2e
@pytest.mark.sglang
@pytest.mark.gpu_1
@pytest.mark.nightly
281
282
283
@pytest.mark.skip(
    reason="Requires 4 GPUs - enable when hardware is consistently available"
)
284
285
286
def test_sglang_disagg_dp_attention(
    request, runtime_services_dynamic_ports, dynamo_dynamic_ports, predownload_models
):
287
288
    """Test sglang disaggregated with DP attention (requires 4 GPUs)"""

289
    # Kept for reference; this test uses a different launch path and is skipped