test_sglang.py 6.18 KB
Newer Older
1
2
3
4
5
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import os
6
from dataclasses import dataclass, field
7
8
9

import pytest

Alec's avatar
Alec committed
10
11
12
13
14
from tests.serve.common import (
    SERVE_TEST_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
15
from tests.utils.engine_process import EngineConfig
16
17
18
19
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
20
21
    embedding_payload,
    embedding_payload_default,
22
)
23
24
25
26
27

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


@dataclass
class SGLangConfig(EngineConfig):
    """Configuration for SGLang test scenarios.

    Inherits every field from ``EngineConfig`` and sets the default for
    ``stragglers`` to ``["SGLANG:EngineCore"]``.
    """

    # default_factory builds a fresh list per instance, so configs never share
    # one mutable default.
    # NOTE(review): presumably these are process-name patterns the test harness
    # cleans up after a run -- confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["SGLANG:EngineCore"])
32
33


34
# Directory passed as `directory=` to several scenario configs below; the
# SGLANG_DIR environment variable, when set, replaces the default path.
sglang_dir = os.getenv("SGLANG_DIR", "/workspace/components/backends/sglang")
35
36
37

# Scenario name -> SGLangConfig. The keys are what the sglang_config_test
# fixture looks up through request.param, and the selected config is handed to
# run_serve_deployment by test_sglang_deployment.
sglang_configs = {
    "aggregated": SGLangConfig(
        name="aggregated",
        directory=SERVE_TEST_DIR,
        script_name="sglang_agg.sh",
        marks=[pytest.mark.gpu_1],
        model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
        env={},
        models_port=8000,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            # TODO: Add metric_payload_default(min_num_requests=N, backend="sglang")
        ],
    ),
    "disaggregated": SGLangConfig(
        name="disaggregated",
        directory=sglang_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
        request_payloads=[chat_payload_default(), completion_payload_default()],
    ),
    "kv_events": SGLangConfig(
        name="kv_events",
        directory=sglang_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        # NOTE(review): presumably this log filter is what makes the
        # expected_log lines below show up in the output -- confirm.
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
        models_port=8000,
        request_payloads=[
            chat_payload_default(
                # Regex patterns, one per log line the scenario is expected to
                # produce (how they are matched is decided by the payload helper).
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
    ),
    "template_verification": SGLangConfig(
        # Tests custom jinja template preprocessing by verifying the template
        # marker 'CUSTOM_TEMPLATE_ACTIVE|' is applied to user messages.
        # The backend (launch/template_verifier.*) checks for this marker
        # and returns "Successfully Applied Chat Template" if found.
        name="template_verification",
        directory=SERVE_TEST_DIR,
        script_name="template_verifier.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
        request_payloads=[
            chat_payload_default(
                expected_response=["Successfully Applied Chat Template"]
            )
        ],
    ),
    "multimodal_agg_qwen": SGLangConfig(
        name="multimodal_agg_qwen",
        directory=sglang_dir,
        script_name="multimodal_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        timeout=360,
        models_port=8000,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                # NOTE: The response text may mention 'bus', 'train', 'streetcar', etc.
                # so we need something consistently found in the response, or a different
                # approach to validation for this test to be stable.
                expected_response=["image"],
                temperature=0.0,
            )
        ],
    ),
    "embedding_agg": SGLangConfig(
        name="embedding_agg",
        directory=sglang_dir,
        script_name="agg_embed.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-Embedding-4B",
        delayed_start=0,
        timeout=180,
        models_port=8000,
        request_payloads=[
            # Test default payload with multiple inputs
            embedding_payload_default(
                repeat_count=2,
                expected_response=["Generated 2 embeddings with dimension"],
            ),
            # Test single string input
            embedding_payload(
                input_text="Hello, world!",
                repeat_count=1,
                expected_response=["Generated 1 embeddings with dimension"],
            ),
            # Test multiple string inputs
            embedding_payload(
                input_text=[
                    "The quick brown fox jumps over the lazy dog.",
                    "Machine learning is transforming technology.",
                    "Natural language processing enables computers to understand text.",
                ],
                repeat_count=1,
                expected_response=["Generated 3 embeddings with dimension"],
            ),
        ],
    ),
}


Alec's avatar
Alec committed
164
@pytest.fixture(params=params_with_model_mark(sglang_configs))
def sglang_config_test(request):
    """Fixture that provides different SGLang test configurations.

    Parametrized with ``params_with_model_mark(sglang_configs)``; each
    ``request.param`` is used as a key into ``sglang_configs``.

    Returns:
        The ``SGLangConfig`` registered under ``request.param``.
    """
    return sglang_configs[request.param]


@pytest.mark.e2e
@pytest.mark.sglang
def test_sglang_deployment(
    sglang_config_test, request, runtime_services, predownload_models
):
    """Test SGLang deployment scenarios using common helpers.

    Runs once per configuration supplied by the ``sglang_config_test``
    fixture and delegates the whole scenario to ``run_serve_deployment``.

    ``runtime_services`` and ``predownload_models`` are not referenced in the
    body; they are requested so pytest sets those fixtures up before the
    deployment starts (presumably backing services and model weights --
    confirm in conftest).
    """
    run_serve_deployment(sglang_config_test, request)
178
179


180
181
182
@pytest.mark.skip(
    reason="Requires 4 GPUs - enable when hardware is consistently available"
)
Alec's avatar
Alec committed
183
def test_sglang_disagg_dp_attention(request, runtime_services, predownload_models):
184
185
    """Test sglang disaggregated with DP attention (requires 4 GPUs)"""

186
    # Kept for reference; this test uses a different launch path and is skipped