# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

4
import base64
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
9
10

import pytest

11
12
13
14
15
from tests.serve.common import (
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
16
from tests.serve.conftest import MULTIMODAL_IMG_PATH, MULTIMODAL_IMG_URL
17
18
19
20
21
22
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
    metric_payload_default,
23
24
25
26
27
28
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


@dataclass
class VLLMConfig(EngineConfig):
    """Configuration for vLLM test scenarios.

    Subclasses ``EngineConfig`` and declares a vLLM-specific default for
    ``stragglers``; every other field comes from the base class.
    """

    # A default_factory is used so each instance gets its own list object.
    # NOTE(review): presumably the names of leftover engine processes the test
    # harness should look for on teardown -- confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])
33
34


35
# Directory containing the vLLM example launch scripts. A non-empty VLLM_DIR
# environment variable takes precedence; otherwise fall back to the examples
# directory under WORKSPACE_DIR.
vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/vllm"
)
38

39

40
41
42
43
# vLLM test configurations
#
# Maps a scenario key to the VLLMConfig describing it. Each key matches the
# config's ``name``. The parametrized fixture in this module builds its params
# from this dict and looks the selected config up by key.
vllm_configs = {
    "aggregated": VLLMConfig(
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
        ],
    ),
    "aggregated_lmcache": VLLMConfig(
        name="aggregated_lmcache",
        directory=vllm_dir,
        script_name="agg_lmcache.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
            metric_payload_default(min_num_requests=6, backend="lmcache"),
        ],
    ),
    "agg-request-plane-tcp": VLLMConfig(
        name="agg-request-plane-tcp",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        script_args=["--tcp"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-request-plane-http": VLLMConfig(
        name="agg-request-plane-http",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        script_args=["--http"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-router": VLLMConfig(
        name="agg-router",
        directory=vllm_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(
                # Regex patterns expected in the logs; the DYN_LOG setting
                # below raises the kv_router log levels for this scenario.
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
    ),
    "disaggregated": VLLMConfig(
        name="disaggregated",
        directory=vllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "deepep": VLLMConfig(
        name="deepep",
        directory=vllm_dir,
        script_name="dsr1_dep.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.vllm,
            pytest.mark.h100,
            pytest.mark.nightly,
        ],
        model="deepseek-ai/DeepSeek-V2-Lite",
        script_args=[
            "--model",
            "deepseek-ai/DeepSeek-V2-Lite",
            "--num-nodes",
            "1",
            "--node-rank",
            "0",
            "--gpus-per-node",
            "2",
        ],
        timeout=700,
        request_payloads=[
            chat_payload_default(expected_response=["joke"]),
            completion_payload_default(expected_response=["joke"]),
        ],
    ),
    "multimodal_agg_llava_epd": VLLMConfig(
        name="multimodal_agg_llava_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        request_payloads=[
            chat_payload(
                [
                    {
                        "type": "text",
                        "text": "What colors are in the following image? Respond only with the colors.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": MULTIMODAL_IMG_URL},
                    },
                ],
                repeat_count=1,
                expected_response=["purple"],
                temperature=0.0,
                max_tokens=100,
            )
        ],
    ),
    "multimodal_agg_qwen_epd": VLLMConfig(
        name="multimodal_agg_qwen_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {
                        "type": "text",
                        "text": "What colors are in the following image? Respond only with the colors.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": MULTIMODAL_IMG_URL},
                    },
                ],
                repeat_count=1,
                expected_response=["purple"],
                max_tokens=100,
            )
        ],
    ),
    "multimodal_agg_qwen": VLLMConfig(
        name="multimodal_agg_qwen",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        delayed_start=0,
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {
                        "type": "text",
                        "text": "What colors are in the following image? Respond only with the colors.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": MULTIMODAL_IMG_URL},
                    },
                ],
                repeat_count=1,
                expected_response=["purple"],
                max_tokens=100,
            ),
        ],
    ),
    "multimodal_agg_llava": VLLMConfig(
        name="multimodal_agg_llava",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[
            pytest.mark.gpu_2,
            # https://github.com/ai-dynamo/dynamo/issues/4501
            pytest.mark.xfail(strict=False),
        ],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        delayed_start=0,
        timeout=360,
        request_payloads=[
            # HTTP URL test
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
                temperature=0.0,
            ),
            # String content test - verifies string → array conversion for multimodal templates
            chat_payload_default(
                repeat_count=1,
                expected_response=[],  # Just validate no error
            ),
        ],
    ),
    # TODO: Update this test case when we have video multimodal support in vllm official components
    "multimodal_video_agg": VLLMConfig(
        name="multimodal_video_agg",
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="video_agg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        delayed_start=0,
        script_args=["--model", "llava-hf/LLaVA-NeXT-Video-7B-hf"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "Describe the video in detail"},
                    {
                        "type": "video_url",
                        "video_url": {
                            "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["rabbit"],
                temperature=0.7,
            )
        ],
    ),
    "multimodal_audio_agg": VLLMConfig(
        name="multimodal_audio_agg",
        # Built from WORKSPACE_DIR (like the video scenario) instead of a
        # hard-coded "/workspace" prefix, so a relocated workspace still works.
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="audio_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2-Audio-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
        timeout=500,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is recited in the audio?"},
                    {
                        "type": "audio_url",
                        "audio_url": {
                            "url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"
                        },
                    },
                ],
                repeat_count=1,
                # NOTE(review): a full-sentence expectation combined with
                # temperature=0.8 looks fragile -- confirm how
                # expected_response is matched.
                expected_response=[
                    "The original content of this audio is:'yet these thoughts affected Hester Pynne less with hope than apprehension.'"
                ],
                temperature=0.8,
            )
        ],
    ),
    # TODO: Enable this test case when we have 4 GPUs runners.
    # "multimodal_disagg": VLLMConfig(
    #     name="multimodal_disagg",
    #     directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
    #     script_name="disagg.sh",
    #     marks=[pytest.mark.gpu_4, pytest.mark.vllm],
    #     model="llava-hf/llava-1.5-7b-hf",
    #     delayed_start=45,
    #     script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
    # ),
}


Alec's avatar
Alec committed
332
@pytest.fixture(params=params_with_model_mark(vllm_configs))
def vllm_config_test(request):
    """Fixture that provides different vLLM test configurations.

    The fixture is parametrized from ``vllm_configs`` via
    ``params_with_model_mark``; ``request.param`` is used as the key to look
    up the matching config in ``vllm_configs``.
    """
    scenario_key = request.param
    return vllm_configs[scenario_key]


338
@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.nightly
def test_serve_deployment(
    vllm_config_test, request, runtime_services, predownload_models, image_server
):
    """
    Test dynamo serve deployments with different graph configurations.

    The parametrized ``vllm_config_test`` fixture supplies the scenario config,
    which is handed to ``run_serve_deployment`` together with ``request``.
    ``runtime_services``, ``predownload_models`` and ``image_server`` are not
    referenced in the body; presumably they are requested for their setup
    side effects.
    """
    run_serve_deployment(vllm_config_test, request)
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395


@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.gpu_2
def test_multimodal_b64(request, runtime_services, predownload_models):
    """
    Exercise multimodal inference with an inline base64 ``data:`` image URL.

    Kept apart from the parametrized scenarios so that the image file is read
    when the test body runs rather than at collection time; a problem with the
    image therefore only surfaces when this test is actually executed.
    """
    model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

    # Read the image lazily (at execution time) and base64-encode it.
    with open(MULTIMODAL_IMG_PATH, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_image = base64.b64encode(raw_bytes).decode()

    # Message content: a text question followed by the inline image.
    text_part = {
        "type": "text",
        "text": "What colors are in the following image? Respond only with the colors.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
    }
    inline_image_payload = chat_payload(
        [text_part, image_part],
        repeat_count=1,
        expected_response=["purple"],
        max_tokens=100,
    )

    # Ad-hoc scenario config; pytest markers live on the function itself.
    deployment = VLLMConfig(
        name="test_multimodal_b64",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[],
        model=model_id,
        script_args=["--model", model_id],
        delayed_start=0,
        timeout=360,
        request_payloads=[inline_image_payload],
    )

    run_serve_deployment(deployment, request)