# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import base64
import logging
import os
import random
from dataclasses import dataclass, field

import pytest

from tests.serve.common import (
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
from tests.serve.conftest import MULTIMODAL_IMG_PATH, MULTIMODAL_IMG_URL
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
    metric_payload_default,
)
from tests.utils.payloads import ToolCallingChatPayload

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


@dataclass
class VLLMConfig(EngineConfig):
    """Configuration for vLLM test scenarios.

    Adds a vLLM-specific default for ``stragglers`` on top of the fields
    inherited from ``EngineConfig``.
    """

    # Defaults to the vLLM engine-core process name. Presumably the harness uses
    # this list to find leftover processes after a run -- TODO confirm against
    # EngineConfig. A default_factory is used so instances never share one list.
    stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])


# Directory containing the vLLM launch scripts used by the configs below.
# A set, non-empty VLLM_DIR environment variable takes precedence; otherwise
# the in-repo examples directory under WORKSPACE_DIR is used.
vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/vllm"
)


# vLLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~5.5 minutes total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
# Registry of vLLM serve scenarios, keyed by scenario name. Each value is a
# VLLMConfig describing the launch script, its arguments/environment, the
# pytest marks to apply, and the request payloads to send. The
# ``vllm_config_test`` fixture parametrizes over this mapping.
vllm_configs = {
    "aggregated": VLLMConfig(
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.timeout(130),  # 3x measured time (43s)
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
        ],
    ),
    "aggregated_lmcache": VLLMConfig(
        name="aggregated_lmcache",
        directory=vllm_dir,
        script_name="agg_lmcache.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.timeout(210),  # 3x estimated time (70s)
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
            metric_payload_default(min_num_requests=6, backend="lmcache"),
        ],
    ),
    "aggregated_lmcache_multiproc": VLLMConfig(
        name="aggregated_lmcache_multiproc",
        directory=vllm_dir,
        script_name="agg_lmcache_multiproc.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.timeout(210),  # 3x estimated time (70s)
        ],
        model="Qwen/Qwen3-0.6B",
        # The directory name is computed once, when this module is imported;
        # pid + random suffix keep concurrent test processes from colliding.
        env={
            "PROMETHEUS_MULTIPROC_DIR": f"/tmp/prometheus_multiproc_test_{os.getpid()}_{random.randint(0, 10000)}"
        },
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
            metric_payload_default(min_num_requests=6, backend="lmcache"),
        ],
    ),
    "agg-request-plane-tcp": VLLMConfig(
        name="agg-request-plane-tcp",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.timeout(130),  # 3x measured time (43s)
        ],
        model="Qwen/Qwen3-0.6B",
        script_args=["--tcp"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-request-plane-http": VLLMConfig(
        name="agg-request-plane-http",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.timeout(130),  # 3x measured time (43s)
        ],
        model="Qwen/Qwen3-0.6B",
        script_args=["--http"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-router": VLLMConfig(
        name="agg-router",
        directory=vllm_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            # The expected_log patterns target the KV-router log targets that
            # the DYN_LOG setting below enables.
            chat_payload_default(
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
    ),
    "disaggregated": VLLMConfig(
        name="disaggregated",
        directory=vllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "deepep": VLLMConfig(
        name="deepep",
        directory=vllm_dir,
        script_name="dsr1_dep.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.vllm,
            pytest.mark.h100,
            pytest.mark.nightly,
        ],
        model="deepseek-ai/DeepSeek-V2-Lite",
        script_args=[
            "--model",
            "deepseek-ai/DeepSeek-V2-Lite",
            "--num-nodes",
            "1",
            "--node-rank",
            "0",
            "--gpus-per-node",
            "2",
        ],
        timeout=700,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "multimodal_agg_llava_epd": VLLMConfig(
        name="multimodal_agg_llava_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        request_payloads=[
            chat_payload(
                [
                    {
                        "type": "text",
                        "text": "What colors are in the following image? Respond only with the colors.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": MULTIMODAL_IMG_URL},
                    },
                ],
                repeat_count=1,
                expected_response=["purple"],
                temperature=0.0,
                max_tokens=100,
            )
        ],
    ),
    "multimodal_agg_qwen_epd": VLLMConfig(
        name="multimodal_agg_qwen_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {
                        "type": "text",
                        "text": "What colors are in the following image? Respond only with the colors.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": MULTIMODAL_IMG_URL},
                    },
                ],
                repeat_count=1,
                expected_response=["purple"],
                max_tokens=100,
            )
        ],
    ),
    "multimodal_agg_qwen": VLLMConfig(
        name="multimodal_agg_qwen",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        delayed_start=0,
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {
                        "type": "text",
                        "text": "What colors are in the following image? Respond only with the colors.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": MULTIMODAL_IMG_URL},
                    },
                ],
                repeat_count=1,
                expected_response=["purple"],
                max_tokens=100,
            ),
        ],
    ),
    "multimodal_agg_llava": VLLMConfig(
        name="multimodal_agg_llava",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[
            pytest.mark.gpu_2,
            # https://github.com/ai-dynamo/dynamo/issues/4501
            pytest.mark.xfail(strict=False),
        ],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        delayed_start=0,
        timeout=360,
        request_payloads=[
            # HTTP URL test
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
                temperature=0.0,
            ),
            # String content test - verifies string → array conversion for multimodal templates
            chat_payload_default(
                repeat_count=1,
                expected_response=[],  # Just validate no error
            ),
        ],
    ),
    # TODO: Update this test case when we have video multimodal support in vllm official components
    "multimodal_video_agg": VLLMConfig(
        name="multimodal_video_agg",
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="video_agg.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        delayed_start=0,
        script_args=["--model", "llava-hf/LLaVA-NeXT-Video-7B-hf"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "Describe the video in detail"},
                    {
                        "type": "video_url",
                        "video_url": {
                            "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["rabbit"],
                temperature=0.7,
            )
        ],
    ),
    "multimodal_audio_agg": VLLMConfig(
        name="multimodal_audio_agg",
        # Resolved relative to WORKSPACE_DIR, matching multimodal_video_agg,
        # instead of hard-coding an absolute /workspace path.
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="audio_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2-Audio-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
        timeout=500,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is recited in the audio?"},
                    {
                        "type": "audio_url",
                        "audio_url": {
                            "url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=[
                    "The original content of this audio is:'yet these thoughts affected Hester Pynne less with hope than apprehension.'"
                ],
                temperature=0.8,
            )
        ],
    ),
    "aggregated_toolcalling": VLLMConfig(
        name="aggregated_toolcalling",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[pytest.mark.gpu_2, pytest.mark.multimodal],
        model="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8",
        script_args=[
            "--model",
            "Qwen/Qwen3-VL-30B-A3B-Instruct-FP8",
            "--max-model-len",
            "10000",
            "--dyn-tool-call-parser",
            "hermes",
        ],
        delayed_start=0,
        timeout=600,
        request_payloads=[
            ToolCallingChatPayload(
                body={
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": "Describe what you see in this image in detail.",
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": MULTIMODAL_IMG_URL},
                                },
                            ],
                        }
                    ],
                    "tools": [
                        {
                            "type": "function",
                            "function": {
                                "name": "describe_image",
                                "description": "Provides detailed description of objects and scenes in an image",
                                "parameters": {
                                    "type": "object",
                                    "properties": {
                                        "objects": {
                                            "type": "array",
                                            "items": {"type": "string"},
                                            "description": "List of objects detected in the image",
                                        },
                                        "scene": {
                                            "type": "string",
                                            "description": "Overall scene description",
                                        },
                                    },
                                    "required": ["objects", "scene"],
                                },
                            },
                        }
                    ],
                    "tool_choice": "auto",
                    "max_tokens": 1024,
                },
                repeat_count=1,
                expected_response=["purple"],  # Validate image understanding
                expected_log=[],
                expected_tool_name="describe_image",  # Validate tool call happened
            )
        ],
    ),
    # TODO: Enable this test case when we have 4 GPUs runners.
    # "multimodal_disagg": VLLMConfig(
    #     name="multimodal_disagg",
    #     directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
    #     script_name="disagg.sh",
    #     marks=[pytest.mark.gpu_4, pytest.mark.vllm],
    #     model="llava-hf/llava-1.5-7b-hf",
    #     delayed_start=45,
    #     script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
    # ),
    "completions_only": VLLMConfig(
        name="completions_only",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[
            pytest.mark.gpu_1,
            pytest.mark.timeout(180),  # 3x estimated time (60s) for 7B model
        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=[
            "--model",
            "deepseek-ai/deepseek-llm-7b-base",
            "--dyn-endpoint-types",
            "completions",
        ],
        request_payloads=[
            completion_payload_default(),
        ],
    ),
}


@pytest.fixture(params=params_with_model_mark(vllm_configs))
def vllm_config_test(request):
    """Fixture that provides different vLLM test configurations.

    The fixture is parametrized with the result of
    ``params_with_model_mark(vllm_configs)``; each ``request.param`` is used
    as a key into ``vllm_configs``.

    Returns:
        The ``vllm_configs`` entry stored under ``request.param``.
    """
    return vllm_configs[request.param]


@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.nightly
def test_serve_deployment(
    vllm_config_test, request, runtime_services, predownload_models, image_server
):
    """
    Test dynamo serve deployments with different graph configurations.

    ``vllm_config_test`` supplies the scenario configuration, which is handed
    to ``run_serve_deployment`` together with the pytest ``request`` object.
    ``runtime_services``, ``predownload_models`` and ``image_server`` are not
    referenced in the body; they are requested only so pytest sets those
    fixtures up before the deployment runs.
    """
    config = vllm_config_test
    run_serve_deployment(config, request)


@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.gpu_2
def test_multimodal_b64(request, runtime_services, predownload_models):
    """
    Exercise multimodal inference with an inline base64 ``data:`` image URL.

    The image file is read inside the test body instead of at import time, so
    a missing image fails only this test when it runs, not test collection.
    ``runtime_services`` and ``predownload_models`` are requested purely as
    fixtures and are not referenced in the body.
    """
    model_id = "Qwen/Qwen2.5-VL-7B-Instruct"

    # Read and encode the image while the test executes.
    with open(MULTIMODAL_IMG_PATH, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_image = base64.b64encode(raw_bytes).decode()

    # Two-part user message: the text prompt followed by the inline image.
    prompt_part = {
        "type": "text",
        "text": "What colors are in the following image? Respond only with the colors.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
    }
    payload = chat_payload(
        [prompt_part, image_part],
        repeat_count=1,
        expected_response=["purple"],
        max_tokens=100,
    )

    # Ad-hoc scenario; marks stay empty because the markers are applied on
    # the test function itself.
    deployment = VLLMConfig(
        name="test_multimodal_b64",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[],
        model=model_id,
        script_args=["--model", model_id],
        delayed_start=0,
        timeout=360,
        request_payloads=[payload],
    )

    run_serve_deployment(deployment, request)