# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import os
6
from dataclasses import dataclass, field
7
8
9

import pytest

10
11
12
13
14
from tests.serve.common import (
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
15
16
17
18
19
20
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
    metric_payload_default,
21
22
23
24
25
26
)

logger = logging.getLogger(__name__)


@dataclass
class VLLMConfig(EngineConfig):
    """Configuration for vLLM test scenarios.

    Extends ``EngineConfig`` with a vLLM-specific default for ``stragglers``.
    """

    # Fresh list per instance (default_factory) so configs never share state.
    # NOTE(review): presumably names of leftover processes the harness should
    # clean up after a run -- confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])
31
32


33
# Directory containing the vLLM launch scripts. The VLLM_DIR environment
# variable overrides the default; because of the `or`, an unset *or empty*
# VLLM_DIR falls back to the examples directory under WORKSPACE_DIR.
vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/vllm"
)
36
37
38
39
40

# vLLM test configurations
# Maps a scenario name to the VLLMConfig describing how to launch it (script,
# args, env), which pytest marks it carries, and which request payloads are
# sent to it. Each dict key matches the config's own ``name`` field.
vllm_configs = {
    # Aggregated deployment; also sends a metrics payload.
    "aggregated": VLLMConfig(
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
        ],
    ),
    # Same launch script as the HTTP variant below, started with --tcp.
    "agg-request-plane-tcp": VLLMConfig(
        name="agg-request-plane-tcp",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        script_args=["--tcp"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    # Same launch script as the TCP variant above, started with --http.
    "agg-request-plane-http": VLLMConfig(
        name="agg-request-plane-http",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        script_args=["--http"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    # Router scenario: the chat payload carries regexes for expected log
    # lines, and DYN_LOG raises log verbosity for the kv_router modules.
    "agg-router": VLLMConfig(
        name="agg-router",
        directory=vllm_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
    ),
    "disaggregated": VLLMConfig(
        name="disaggregated",
        directory=vllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    # Carries the extra vllm and h100 marks and a longer timeout (700).
    "deepep": VLLMConfig(
        name="deepep",
        directory=vllm_dir,
        script_name="dsr1_dep.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.vllm,
            pytest.mark.h100,
        ],
        model="deepseek-ai/DeepSeek-V2-Lite",
        script_args=[
            "--model",
            "deepseek-ai/DeepSeek-V2-Lite",
            "--num-nodes",
            "1",
            "--node-rank",
            "0",
            "--gpus-per-node",
            "2",
        ],
        timeout=700,
        request_payloads=[
            chat_payload_default(expected_response=["joke"]),
            completion_payload_default(expected_response=["joke"]),
        ],
    ),
    # Image + text chat request; the response is expected to mention "bus".
    "multimodal_agg_llava_epd": VLLMConfig(
        name="multimodal_agg_llava_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
                temperature=0.0,
            )
        ],
    ),
    "multimodal_agg_qwen_epd": VLLMConfig(
        name="multimodal_agg_qwen_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
            )
        ],
    ),
    "multimodal_agg_qwen": VLLMConfig(
        name="multimodal_agg_qwen",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        delayed_start=0,
        timeout=360,
        request_payloads=[
            # HTTP URL test
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
            ),
            # Base64 data URL test (1x1 PNG inline, avoids network fetch)
            chat_payload(
                [
                    {"type": "text", "text": "What do you see in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAAAAAA6fptVAAAACklEQVR4nGNoAAAAggCBd81ytgAAAABJRU5ErkJggg=="
                        },
                    },
                ],
                repeat_count=1,
                expected_response=[],  # Just validate no error
            ),
        ],
    ),
    # TODO: Update this test case when we have video multimodal support in vllm official components
    # Unlike the entries above, this one launches from examples/multimodal
    # rather than vllm_dir.
    "multimodal_video_agg": VLLMConfig(
        name="multimodal_video_agg",
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="video_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        delayed_start=0,
        script_args=["--model", "llava-hf/LLaVA-NeXT-Video-7B-hf"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "Describe the video in detail"},
                    {
                        "type": "video_url",
                        "video_url": {
                            "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["rabbit"],
                temperature=0.7,
            )
        ],
    ),
    # TODO: Enable this test case when we have 4 GPUs runners.
    # "multimodal_disagg": VLLMConfig(
    #     name="multimodal_disagg",
    #     directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
    #     script_name="disagg.sh",
    #     marks=[pytest.mark.gpu_4, pytest.mark.vllm],
    #     model="llava-hf/llava-1.5-7b-hf",
    #     delayed_start=45,
    #     script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
    # ),
}


Alec's avatar
Alec committed
260
@pytest.fixture(params=params_with_model_mark(vllm_configs))
def vllm_config_test(request):
    """Fixture that provides different vLLM test configurations.

    The fixture is parametrized with ``params_with_model_mark(vllm_configs)``;
    for each run, ``request.param`` is used as the key into ``vllm_configs``
    and the matching config object is returned.
    """
    return vllm_configs[request.param]


266
@pytest.mark.vllm
@pytest.mark.e2e
def test_serve_deployment(
    vllm_config_test, request, runtime_services, predownload_models
):
    """
    Test dynamo serve deployments with different graph configurations.

    The config supplied by the ``vllm_config_test`` fixture is handed, together
    with the pytest ``request`` object, to ``run_serve_deployment``.
    ``runtime_services`` and ``predownload_models`` are requested as fixtures
    but never referenced in the body, so they are needed only for whatever
    setup they perform.
    """
    run_serve_deployment(vllm_config_test, request)