test_vllm.py 11.6 KB
Newer Older
1
2
3
4
5
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import os
6
from dataclasses import dataclass, field
7
8
9

import pytest

10
11
12
13
14
from tests.serve.common import (
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
15
16
17
18
19
20
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
    metric_payload_default,
21
22
23
24
25
26
)

logger = logging.getLogger(__name__)


@dataclass
class VLLMConfig(EngineConfig):
    """EngineConfig specialization used by the vLLM serve test scenarios."""

    # Defaults to the single entry "VLLM:EngineCore". A factory is used so
    # every instance receives its own list instead of sharing one object.
    stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])
31
32


33
# Directory containing the vLLM launch scripts. A non-empty VLLM_DIR
# environment variable wins; otherwise fall back to the in-repo examples.
vllm_dir = os.environ.get("VLLM_DIR")
if not vllm_dir:
    vllm_dir = os.path.join(WORKSPACE_DIR, "examples/backends/vllm")
36
37
38
39
40

# vLLM test configurations.
#
# Maps a scenario name to the VLLMConfig describing it. The mapping is handed
# to params_with_model_mark() to parametrize the vllm_config_test fixture,
# which then looks each scenario up again by key, so every key must stay in
# sync with the ``name`` of its VLLMConfig.
#
# Most scenarios launch a script from ``vllm_dir``; the video and audio
# scenarios launch scripts from the shared ``examples/multimodal`` directory
# under WORKSPACE_DIR instead.
vllm_configs = {
    "aggregated": VLLMConfig(
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
        ],
    ),
    "aggregated_lmcache": VLLMConfig(
        name="aggregated_lmcache",
        directory=vllm_dir,
        script_name="agg_lmcache.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
            metric_payload_default(min_num_requests=6, backend="lmcache"),
        ],
    ),
    # The two request-plane scenarios share one script and differ only in the
    # transport flag passed through script_args.
    "agg-request-plane-tcp": VLLMConfig(
        name="agg-request-plane-tcp",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        script_args=["--tcp"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-request-plane-http": VLLMConfig(
        name="agg-request-plane-http",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        script_args=["--http"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-router": VLLMConfig(
        name="agg-router",
        directory=vllm_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(
                # Log patterns expected from the KV router; the DYN_LOG
                # setting below raises the log level of the publisher and
                # scheduler modules for this scenario.
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
    ),
    "disaggregated": VLLMConfig(
        name="disaggregated",
        directory=vllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "deepep": VLLMConfig(
        name="deepep",
        directory=vllm_dir,
        script_name="dsr1_dep.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.vllm,
            pytest.mark.h100,
        ],
        model="deepseek-ai/DeepSeek-V2-Lite",
        script_args=[
            "--model",
            "deepseek-ai/DeepSeek-V2-Lite",
            "--num-nodes",
            "1",
            "--node-rank",
            "0",
            "--gpus-per-node",
            "2",
        ],
        timeout=700,
        request_payloads=[
            chat_payload_default(expected_response=["joke"]),
            completion_payload_default(expected_response=["joke"]),
        ],
    ),
    "multimodal_agg_llava_epd": VLLMConfig(
        name="multimodal_agg_llava_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
                temperature=0.0,
            )
        ],
    ),
    "multimodal_agg_qwen_epd": VLLMConfig(
        name="multimodal_agg_qwen_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
            )
        ],
    ),
    "multimodal_agg_qwen": VLLMConfig(
        name="multimodal_agg_qwen",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        delayed_start=0,
        timeout=360,
        request_payloads=[
            # HTTP URL test
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
            ),
            # Base64 data URL test (1x1 PNG inline, avoids network fetch)
            chat_payload(
                [
                    {"type": "text", "text": "What do you see in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAAAAAA6fptVAAAACklEQVR4nGNoAAAAggCBd81ytgAAAABJRU5ErkJggg=="
                        },
                    },
                ],
                repeat_count=1,
                expected_response=[],  # Just validate no error
            ),
        ],
    ),
    "multimodal_agg_llava": VLLMConfig(
        name="multimodal_agg_llava",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[
            pytest.mark.gpu_2,
            # Non-strict xfail: a failure is tolerated and a pass is not an
            # error. Tracked in https://github.com/ai-dynamo/dynamo/issues/4501
            pytest.mark.xfail(strict=False),
        ],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        delayed_start=0,
        timeout=360,
        request_payloads=[
            # HTTP URL test
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
                temperature=0.0,
            ),
            # String content test - verifies string → array conversion for multimodal templates
            chat_payload_default(
                repeat_count=1,
                expected_response=[],  # Just validate no error
            ),
        ],
    ),
    # TODO: Update this test case when we have video multimodal support in vllm official components
    "multimodal_video_agg": VLLMConfig(
        name="multimodal_video_agg",
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="video_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        delayed_start=0,
        script_args=["--model", "llava-hf/LLaVA-NeXT-Video-7B-hf"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "Describe the video in detail"},
                    {
                        "type": "video_url",
                        "video_url": {
                            "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["rabbit"],
                temperature=0.7,
            )
        ],
    ),
    "multimodal_audio_agg": VLLMConfig(
        name="multimodal_audio_agg",
        # Resolved against WORKSPACE_DIR, like the video scenario, so the
        # test does not depend on the checkout living at /workspace.
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="audio_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2-Audio-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
        timeout=500,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is recited in the audio?"},
                    {
                        "type": "audio_url",
                        "audio_url": {
                            "url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=[
                    "The original content of this audio is:'yet these thoughts affected Hester Pynne less with hope than apprehension.'"
                ],
                temperature=0.8,
            )
        ],
    ),
    # TODO: Enable this test case when we have 4 GPUs runners.
    # "multimodal_disagg": VLLMConfig(
    #     name="multimodal_disagg",
    #     directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
    #     script_name="disagg.sh",
    #     marks=[pytest.mark.gpu_4, pytest.mark.vllm],
    #     model="llava-hf/llava-1.5-7b-hf",
    #     delayed_start=45,
    #     script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
    # ),
}


Alec's avatar
Alec committed
337
@pytest.fixture(params=params_with_model_mark(vllm_configs))
def vllm_config_test(request):
    """Return the vLLM scenario config selected by the current parameter.

    The fixture is parametrized from ``vllm_configs`` via
    ``params_with_model_mark``; ``request.param`` is used as the key into
    that same mapping.
    """
    scenario_key = request.param
    return vllm_configs[scenario_key]


343
@pytest.mark.vllm
@pytest.mark.e2e
def test_serve_deployment(
    vllm_config_test, request, runtime_services, predownload_models
):
    """Run one parametrized vLLM serve deployment scenario.

    The scenario config supplied by the ``vllm_config_test`` fixture is
    passed, together with the pytest ``request`` object, to
    ``run_serve_deployment``.

    ``runtime_services`` and ``predownload_models`` are requested but never
    referenced in the body -- presumably they exist for their setup side
    effects; confirm against the fixture definitions.
    """
    run_serve_deployment(vllm_config_test, request)