"...crds/templates/nvidia.com_dynamocomponentdeployments.yaml" did not exist on "c544e8ec4cbd1af390d49f74899720542ab985e4"
test_vllm.py 12.6 KB
Newer Older
1
2
3
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

4
import base64
5
6
import logging
import os
7
from dataclasses import dataclass, field
8
9
10

import pytest

11
12
13
14
15
from tests.serve.common import (
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
16
from tests.serve.conftest import MULTIMODAL_IMG_PATH, MULTIMODAL_IMG_URL
17
18
19
20
21
22
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
    metric_payload_default,
23
24
25
26
27
28
)

logger = logging.getLogger(__name__)


@dataclass
class VLLMConfig(EngineConfig):
    """Configuration for vLLM test scenarios.

    Extends ``EngineConfig`` with a vLLM-specific default for ``stragglers``;
    every other field is inherited unchanged.
    """

    # Defaults to the vLLM engine-core process name. A default_factory is used
    # so each instance gets its own list rather than sharing one mutable
    # default.
    # NOTE(review): presumably these are leftover process names that the
    # harness cleans up after a run -- confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])
33
34


35
# Directory used as ``directory`` for the vLLM scenarios below. The VLLM_DIR
# environment variable overrides it; because of the ``or``, an unset *or empty*
# VLLM_DIR falls back to the in-repo examples directory.
vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/vllm"
)
38

39

40
41
42
43
# vLLM test configurations.
#
# Maps a scenario name to the VLLMConfig describing it. The keys are fed to
# ``params_with_model_mark`` to parametrize the ``vllm_config_test`` fixture,
# and each entry's ``name`` repeats its key. An entry names the launch script
# (``script_name`` under ``directory``), the pytest marks to apply, the model,
# and the request payloads to send.
vllm_configs = {
    "aggregated": VLLMConfig(
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
        ],
    ),
    "aggregated_lmcache": VLLMConfig(
        name="aggregated_lmcache",
        directory=vllm_dir,
        script_name="agg_lmcache.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
            metric_payload_default(min_num_requests=6, backend="lmcache"),
        ],
    ),
    "agg-request-plane-tcp": VLLMConfig(
        name="agg-request-plane-tcp",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        script_args=["--tcp"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-request-plane-http": VLLMConfig(
        name="agg-request-plane-http",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        script_args=["--http"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-router": VLLMConfig(
        name="agg-router",
        directory=vllm_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        # Log levels for the two modules named in the expected_log patterns'
        # targets (publisher at trace, scheduler at info).
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
    ),
    "disaggregated": VLLMConfig(
        name="disaggregated",
        directory=vllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "deepep": VLLMConfig(
        name="deepep",
        directory=vllm_dir,
        script_name="dsr1_dep.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.vllm,
            pytest.mark.h100,
        ],
        model="deepseek-ai/DeepSeek-V2-Lite",
        script_args=[
            "--model",
            "deepseek-ai/DeepSeek-V2-Lite",
            "--num-nodes",
            "1",
            "--node-rank",
            "0",
            "--gpus-per-node",
            "2",
        ],
        timeout=700,
        request_payloads=[
            chat_payload_default(expected_response=["joke"]),
            completion_payload_default(expected_response=["joke"]),
        ],
    ),
    "multimodal_agg_llava_epd": VLLMConfig(
        name="multimodal_agg_llava_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        request_payloads=[
            chat_payload(
                [
                    {
                        "type": "text",
                        "text": "What colors are in the following image? Respond only with the colors.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": MULTIMODAL_IMG_URL},
                    },
                ],
                repeat_count=1,
                expected_response=["purple"],
                temperature=0.0,
                max_tokens=100,
            )
        ],
    ),
    "multimodal_agg_qwen_epd": VLLMConfig(
        name="multimodal_agg_qwen_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {
                        "type": "text",
                        "text": "What colors are in the following image? Respond only with the colors.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": MULTIMODAL_IMG_URL},
                    },
                ],
                repeat_count=1,
                expected_response=["purple"],
                max_tokens=100,
            )
        ],
    ),
    "multimodal_agg_qwen": VLLMConfig(
        name="multimodal_agg_qwen",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        delayed_start=0,
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {
                        "type": "text",
                        "text": "What colors are in the following image? Respond only with the colors.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": MULTIMODAL_IMG_URL},
                    },
                ],
                repeat_count=1,
                expected_response=["purple"],
                max_tokens=100,
            ),
        ],
    ),
    "multimodal_agg_llava": VLLMConfig(
        name="multimodal_agg_llava",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[
            pytest.mark.gpu_2,
            # https://github.com/ai-dynamo/dynamo/issues/4501
            pytest.mark.xfail(strict=False),
        ],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        delayed_start=0,
        timeout=360,
        request_payloads=[
            # HTTP URL test
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
                temperature=0.0,
            ),
            # String content test - verifies string → array conversion for multimodal templates
            chat_payload_default(
                repeat_count=1,
                expected_response=[],  # Just validate no error
            ),
        ],
    ),
    # TODO: Update this test case when we have video multimodal support in vllm official components
    "multimodal_video_agg": VLLMConfig(
        name="multimodal_video_agg",
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="video_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        delayed_start=0,
        script_args=["--model", "llava-hf/LLaVA-NeXT-Video-7B-hf"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "Describe the video in detail"},
                    {
                        "type": "video_url",
                        "video_url": {
                            "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["rabbit"],
                temperature=0.7,
            )
        ],
    ),
    "multimodal_audio_agg": VLLMConfig(
        name="multimodal_audio_agg",
        # Resolved through WORKSPACE_DIR, like the video scenario, instead of
        # a hard-coded "/workspace" path, so the scenario follows the
        # workspace location used by the rest of this table.
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="audio_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2-Audio-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
        timeout=500,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is recited in the audio?"},
                    {
                        "type": "audio_url",
                        "audio_url": {
                            "url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=[
                    "The original content of this audio is:'yet these thoughts affected Hester Pynne less with hope than apprehension.'"
                ],
                temperature=0.8,
            )
        ],
    ),
    # TODO: Enable this test case when we have 4 GPUs runners.
    # "multimodal_disagg": VLLMConfig(
    #     name="multimodal_disagg",
    #     directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
    #     script_name="disagg.sh",
    #     marks=[pytest.mark.gpu_4, pytest.mark.vllm],
    #     model="llava-hf/llava-1.5-7b-hf",
    #     delayed_start=45,
    #     script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
    # ),
}


Alec's avatar
Alec committed
331
@pytest.fixture(params=params_with_model_mark(vllm_configs))
def vllm_config_test(request):
    """Provide one vLLM test configuration per fixture parameter.

    The parameter list is produced by ``params_with_model_mark(vllm_configs)``;
    ``request.param`` is used as a key into ``vllm_configs``.

    Returns:
        The ``vllm_configs`` entry stored under ``request.param``.
    """
    return vllm_configs[request.param]


337
@pytest.mark.vllm
@pytest.mark.e2e
def test_serve_deployment(
    vllm_config_test, request, runtime_services, predownload_models, image_server
):
    """
    Test dynamo serve deployments with different graph configurations.

    Runs once per configuration supplied by the ``vllm_config_test`` fixture
    and hands that configuration, together with the pytest ``request``, to
    ``run_serve_deployment``.

    ``runtime_services``, ``predownload_models`` and ``image_server`` are not
    referenced in the body.
    NOTE(review): presumably they are requested only for their setup side
    effects -- confirm in conftest.
    """
    run_serve_deployment(vllm_config_test, request)
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393


@pytest.mark.vllm
@pytest.mark.e2e
@pytest.mark.gpu_2
def test_multimodal_b64(request, runtime_services, predownload_models):
    """
    Exercise multimodal inference with an inline base64 ``data:`` image URL.

    Kept apart from the parametrized configurations so that the image file is
    read when the test runs rather than at collection time; a missing image
    therefore only fails this test, and only when it is actually executed.
    """
    model_name = "Qwen/Qwen2.5-VL-7B-Instruct"

    # Read the image bytes now (execution time) and base64-encode them.
    with open(MULTIMODAL_IMG_PATH, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_image = base64.b64encode(raw_bytes).decode()

    # Message content: the text prompt followed by the inline image.
    prompt_part = {
        "type": "text",
        "text": "What colors are in the following image? Respond only with the colors.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
    }
    inline_image_payload = chat_payload(
        [prompt_part, image_part],
        repeat_count=1,
        expected_response=["purple"],
        max_tokens=100,
    )

    # One-off deployment description; the pytest marks live on the function
    # itself, so none are attached to the config.
    deployment = VLLMConfig(
        name="test_multimodal_b64",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[],
        model=model_name,
        script_args=["--model", model_name],
        delayed_start=0,
        timeout=360,
        request_payloads=[inline_image_payload],
    )

    run_serve_deployment(deployment, request)