# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import os
6
from dataclasses import dataclass, field
7
8
9

import pytest

Alec's avatar
Alec committed
10
11
from tests.serve.common import (
    SERVE_TEST_DIR,
12
    WORKSPACE_DIR,
Alec's avatar
Alec committed
13
14
15
    params_with_model_mark,
    run_serve_deployment,
)
16
from tests.utils.engine_process import EngineConfig
17
18
19
20
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
21
22
    embedding_payload,
    embedding_payload_default,
23
    metric_payload_default,
24
)
25
26
27
28
29

logger = logging.getLogger(__name__)


@dataclass
class SGLangConfig(EngineConfig):
    """Configuration for SGLang test scenarios.

    Inherits every field from ``EngineConfig`` and only overrides the default
    value of ``stragglers`` with an SGLang-specific entry.
    """

    # A default_factory is used so each config instance gets its own list
    # rather than sharing one mutable default.
    # NOTE(review): presumably these are process-name patterns that the test
    # harness cleans up after a run -- confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["SGLANG:EngineCore"])
34
35


36
# Directory holding the SGLang launch scripts. The SGLANG_DIR environment
# variable takes precedence; an unset or empty value falls back to the
# in-repo examples directory under WORKSPACE_DIR.
sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/sglang"
)
39
40
41

# Scenario name -> launch configuration. Each key matches the ``name`` of its
# SGLangConfig; the ``sglang_config_test`` fixture looks entries up by key.
sglang_configs = {
    "aggregated": SGLangConfig(
        # Uses backend agg.sh (with metrics enabled) for testing standard
        # aggregated deployment with metrics collection
        name="aggregated",
        directory=sglang_dir,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="sglang"),
        ],
    ),
    "disaggregated": SGLangConfig(
        # Launches disagg.sh and exercises the chat and completion endpoints;
        # carries the gpu_2 mark.
        name="disaggregated",
        directory=sglang_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "disaggregated_same_gpu": SGLangConfig(
        # Uses disagg_same_gpu.sh for single-GPU disaggregated testing
        # Validates metrics from both prefill (port 8081) and decode (port 8082) workers
        # Currently skipped via the "unstable" skip mark below.
        name="disaggregated_same_gpu",
        directory=sglang_dir,
        script_name="disagg_same_gpu.sh",
        marks=[pytest.mark.gpu_1, pytest.mark.skip(reason="unstable")],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            # Validate dynamo_component_* and sglang:* metrics from prefill worker (port 8081)
            metric_payload_default(min_num_requests=6, backend="sglang", port=8081),
            # Validate dynamo_component_* and sglang:* metrics from decode worker (port 8082)
            metric_payload_default(min_num_requests=6, backend="sglang", port=8082),
        ],
    ),
    "kv_events": SGLangConfig(
        # Launches agg_router.sh with KV-router publisher tracing enabled via
        # DYN_LOG, then checks the logs for the expected event/selection lines.
        name="kv_events",
        directory=sglang_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
        models_port=8000,
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
    ),
    "template_verification": SGLangConfig(
        # Tests custom jinja template preprocessing by verifying the template
        # marker 'CUSTOM_TEMPLATE_ACTIVE|' is applied to user messages.
        # The backend (launch/template_verifier.*) checks for this marker
        # and returns "Successfully Applied Chat Template" if found.
        # Uses SERVE_TEST_DIR (not sglang_dir) because template_verifier.sh/.py
        # are test-specific mock scripts in tests/serve/launch/
        name="template_verification",
        directory=SERVE_TEST_DIR,  # special directory for test-specific scripts
        script_name="template_verifier.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
        request_payloads=[
            chat_payload_default(
                expected_response=["Successfully Applied Chat Template"]
            )
        ],
    ),
    "multimodal_agg_qwen": SGLangConfig(
        # Sends one chat request combining a text prompt with an image URL
        # to a vision-language model launched by multimodal_agg.sh.
        name="multimodal_agg_qwen",
        directory=sglang_dir,
        script_name="multimodal_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        timeout=360,
        models_port=8000,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                # NOTE: The response text may mention 'bus', 'train', 'streetcar', etc.
                # so we need something consistently found in the response, or a different
                # approach to validation for this test to be stable.
                expected_response=["image"],
                temperature=0.0,
            )
        ],
    ),
    "embedding_agg": SGLangConfig(
        # Exercises the embeddings endpoint (agg_embed.sh) with the default
        # payload, a single string, and a list of strings.
        name="embedding_agg",
        directory=sglang_dir,
        script_name="agg_embed.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-Embedding-4B",
        delayed_start=0,
        timeout=180,
        models_port=8000,
        request_payloads=[
            # Test default payload with multiple inputs
            embedding_payload_default(
                repeat_count=2,
                expected_response=["Generated 2 embeddings with dimension"],
            ),
            # Test single string input
            embedding_payload(
                input_text="Hello, world!",
                repeat_count=1,
                expected_response=["Generated 1 embeddings with dimension"],
            ),
            # Test multiple string inputs
            embedding_payload(
                input_text=[
                    "The quick brown fox jumps over the lazy dog.",
                    "Machine learning is transforming technology.",
                    "Natural language processing enables computers to understand text.",
                ],
                repeat_count=1,
                expected_response=["Generated 3 embeddings with dimension"],
            ),
        ],
    ),
}


Alec's avatar
Alec committed
194
@pytest.fixture(params=params_with_model_mark(sglang_configs))
def sglang_config_test(request):
    """Fixture that provides different SGLang test configurations.

    Parametrized over ``params_with_model_mark(sglang_configs)``; each
    parameter value is used as a key into ``sglang_configs`` and the
    matching ``SGLangConfig`` is returned.
    """
    return sglang_configs[request.param]


@pytest.mark.e2e
@pytest.mark.sglang
def test_sglang_deployment(
    sglang_config_test, request, runtime_services, predownload_models
):
    """Test SGLang deployment scenarios using common helpers.

    Runs once per configuration supplied by the ``sglang_config_test``
    fixture. ``runtime_services`` and ``predownload_models`` are requested
    only as fixtures; their values are not used in the body.
    """
    run_serve_deployment(sglang_config_test, request)
208
209


210
211
212
@pytest.mark.skip(
    reason="Requires 4 GPUs - enable when hardware is consistently available"
)
Alec's avatar
Alec committed
213
def test_sglang_disagg_dp_attention(request, runtime_services, predownload_models):
214
215
    """Test sglang disaggregated with DP attention (requires 4 GPUs)"""

216
    # Kept for reference; this test uses a different launch path and is skipped