test_sglang.py 7.65 KB
Newer Older
1
2
3
4
5
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import os
6
from dataclasses import dataclass, field
7
8
9

import pytest

Alec's avatar
Alec committed
10
11
from tests.serve.common import (
    SERVE_TEST_DIR,
12
    WORKSPACE_DIR,
Alec's avatar
Alec committed
13
14
15
    params_with_model_mark,
    run_serve_deployment,
)
16
from tests.utils.engine_process import EngineConfig
17
18
19
20
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
21
22
    embedding_payload,
    embedding_payload_default,
23
    metric_payload_default,
24
)
25
26
27
28
29

logger = logging.getLogger(__name__)


@dataclass
class SGLangConfig(EngineConfig):
    """Configuration for SGLang test scenarios.

    Subclass of ``EngineConfig`` that supplies an SGLang-specific default
    for ``stragglers``; every other setting is passed through to the base
    class by the scenario definitions in this module.
    """

    # default_factory builds a fresh list per instance, so scenarios never
    # share (and accidentally mutate) one default list.
    # NOTE(review): presumably the process names that may outlive a run and
    # must be cleaned up -- confirm against EngineConfig's use of this field.
    stragglers: list[str] = field(default_factory=lambda: ["SGLANG:EngineCore"])
34
35


36
# Directory containing the SGLang launch scripts used by the scenarios below.
# `or` (rather than a .get() default) means an unset *or empty* SGLANG_DIR
# falls back to the examples directory under WORKSPACE_DIR.
sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/sglang"
)
39
40
41

# Registry of SGLang test scenarios. Each key equals the `name` of the config
# it maps to; the `sglang_config_test` fixture is parametrized from this dict
# and looks scenarios up by key.
sglang_configs = {
    "aggregated": SGLangConfig(
        # Uses backend agg.sh (with metrics enabled) for testing standard
        # aggregated deployment with metrics collection
        name="aggregated",
        directory=sglang_dir,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="sglang"),
        ],
    ),
    "disaggregated": SGLangConfig(
        name="disaggregated",
        directory=sglang_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_same_gpu": SGLangConfig(
        # Uses disagg_same_gpu.sh for single-GPU disaggregated testing
        # Validates metrics from both prefill (port 8081) and decode (port 8082) workers
        name="disaggregated_same_gpu",
        directory=sglang_dir,
        script_name="disagg_same_gpu.sh",
        # Currently disabled via the skip mark below.
        marks=[pytest.mark.gpu_1, pytest.mark.skip(reason="unstable")],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            # Validate dynamo_component_* and sglang:* metrics from prefill worker (port 8081)
            metric_payload_default(min_num_requests=6, backend="sglang", port=8081),
            # Validate dynamo_component_* and sglang:* metrics from decode worker (port 8082)
            metric_payload_default(min_num_requests=6, backend="sglang", port=8082),
        ],
    ),
    "kv_events": SGLangConfig(
        name="kv_events",
        directory=sglang_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        env={
            # NOTE(review): presumably raises KV-router log verbosity so the
            # log lines matched by `expected_log` below are emitted -- confirm.
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
        models_port=8000,
        request_payloads=[
            chat_payload_default(
                # Regular expressions expected to match the deployment's logs.
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
    ),
    "template_verification": SGLangConfig(
        # Tests custom jinja template preprocessing by verifying the template
        # marker 'CUSTOM_TEMPLATE_ACTIVE|' is applied to user messages.
        # The backend (launch/template_verifier.*) checks for this marker
        # and returns "Successfully Applied Chat Template" if found.
        # Uses SERVE_TEST_DIR (not sglang_dir) because template_verifier.sh/.py
        # are test-specific mock scripts in tests/serve/launch/
        name="template_verification",
        directory=SERVE_TEST_DIR,  # special directory for test-specific scripts
        script_name="template_verifier.sh",
        marks=[pytest.mark.gpu_1, pytest.mark.nightly],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
        request_payloads=[
            chat_payload_default(
                expected_response=["Successfully Applied Chat Template"]
            )
        ],
    ),
    "multimodal_agg_qwen": SGLangConfig(
        name="multimodal_agg_qwen",
        directory=sglang_dir,
        script_name="multimodal_agg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        timeout=360,
        models_port=8000,
        request_payloads=[
            chat_payload(
                # Mixed text + image message content.
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                # NOTE: The response text may mention 'bus', 'train', 'streetcar', etc.
                # so we need something consistently found in the response, or a different
                # approach to validation for this test to be stable.
                expected_response=["image"],
                temperature=0.0,
            )
        ],
    ),
    "embedding_agg": SGLangConfig(
        name="embedding_agg",
        directory=sglang_dir,
        script_name="agg_embed.sh",
        marks=[pytest.mark.gpu_1, pytest.mark.nightly],
        model="Qwen/Qwen3-Embedding-4B",
        delayed_start=0,
        timeout=180,
        models_port=8000,
        request_payloads=[
            # Test default payload with multiple inputs
            embedding_payload_default(
                repeat_count=2,
                expected_response=["Generated 2 embeddings with dimension"],
            ),
            # Test single string input
            embedding_payload(
                input_text="Hello, world!",
                repeat_count=1,
                expected_response=["Generated 1 embeddings with dimension"],
            ),
            # Test multiple string inputs
            embedding_payload(
                input_text=[
                    "The quick brown fox jumps over the lazy dog.",
                    "Machine learning is transforming technology.",
                    "Natural language processing enables computers to understand text.",
                ],
                repeat_count=1,
                expected_response=["Generated 3 embeddings with dimension"],
            ),
        ],
    ),
}


Alec's avatar
Alec committed
194
@pytest.fixture(params=params_with_model_mark(sglang_configs))
def sglang_config_test(request):
    """Provide one SGLang scenario configuration per parametrized run.

    The fixture is parametrized with whatever ``params_with_model_mark``
    derives from ``sglang_configs``; each parameter value is used as a key
    into that dict.

    Args:
        request: pytest's fixture request; ``request.param`` holds the
            scenario key for the current parametrization.

    Returns:
        The ``SGLangConfig`` registered under ``request.param``.
    """
    return sglang_configs[request.param]


@pytest.mark.e2e
@pytest.mark.sglang
def test_sglang_deployment(
    sglang_config_test, request, runtime_services, predownload_models
):
    """Test SGLang deployment scenarios using common helpers.

    Runs once per scenario supplied by the ``sglang_config_test`` fixture and
    hands the scenario to ``run_serve_deployment`` together with the pytest
    ``request`` object.

    ``runtime_services`` and ``predownload_models`` are not referenced in the
    body; they are requested only so pytest sets those fixtures up first.
    NOTE(review): presumably they start backing services and fetch model
    weights -- confirm in the conftest that defines them.
    """
    run_serve_deployment(sglang_config_test, request)
208
209


210
211
212
213
@pytest.mark.e2e
@pytest.mark.sglang
@pytest.mark.gpu_1
@pytest.mark.nightly
214
215
216
@pytest.mark.skip(
    reason="Requires 4 GPUs - enable when hardware is consistently available"
)
Alec's avatar
Alec committed
217
def test_sglang_disagg_dp_attention(request, runtime_services, predownload_models):
218
219
    """Test sglang disaggregated with DP attention (requires 4 GPUs)"""

220
    # Kept for reference; this test uses a different launch path and is skipped