test_vllm.py 9.86 KB
Newer Older
1
2
3
4
5
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import os
6
from dataclasses import dataclass, field
7
8
9

import pytest

10
11
12
13
14
from tests.serve.common import (
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
15
16
17
18
19
20
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
    metric_payload_default,
21
22
23
24
25
26
)

logger = logging.getLogger(__name__)


@dataclass
class VLLMConfig(EngineConfig):
    """Configuration for vLLM test scenarios.

    Subclasses ``EngineConfig`` and sets a vLLM-specific default for
    ``stragglers``; every other field comes from the base class.
    """

    # Built through default_factory so each instance gets its own list rather
    # than sharing one mutable default.
    # NOTE(review): presumably names of engine processes that may outlive a
    # run and need cleanup - confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])
31
32


33
# Directory containing the vLLM launch scripts. A non-empty VLLM_DIR
# environment variable wins; otherwise (unset or empty, hence `or`) fall back
# to the examples directory under WORKSPACE_DIR.
vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/vllm"
)
36
37
38
39
40

# vLLM test configurations.
#
# Maps a scenario name to the VLLMConfig describing it: the launch script and
# its directory, pytest marks, the model, optional script arguments / env /
# timeout, and the request payloads sent to the deployment. Each key is
# repeated as the config's ``name``.
vllm_configs: dict[str, VLLMConfig] = {
    "aggregated": VLLMConfig(
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
        ],
    ),
    "agg-request-plane-tcp": VLLMConfig(
        name="agg-request-plane-tcp",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        script_args=["--tcp"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-request-plane-http": VLLMConfig(
        name="agg-request-plane-http",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        script_args=["--http"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-router": VLLMConfig(
        name="agg-router",
        directory=vllm_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(
                # Regexes passed as expected_log; the DYN_LOG setting below
                # raises verbosity for the modules named in it.
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
    ),
    "disaggregated": VLLMConfig(
        name="disaggregated",
        directory=vllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "deepep": VLLMConfig(
        name="deepep",
        directory=vllm_dir,
        script_name="dsr1_dep.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.vllm,
            pytest.mark.h100,
        ],
        model="deepseek-ai/DeepSeek-V2-Lite",
        script_args=[
            "--model",
            "deepseek-ai/DeepSeek-V2-Lite",
            "--num-nodes",
            "1",
            "--node-rank",
            "0",
            "--gpus-per-node",
            "2",
        ],
        timeout=700,
        request_payloads=[
            chat_payload_default(expected_response=["joke"]),
            completion_payload_default(expected_response=["joke"]),
        ],
    ),
    "multimodal_agg_llava_epd": VLLMConfig(
        name="multimodal_agg_llava_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
                temperature=0.0,
            )
        ],
    ),
    "multimodal_agg_qwen_epd": VLLMConfig(
        name="multimodal_agg_qwen_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
            )
        ],
    ),
    "multimodal_agg_qwen": VLLMConfig(
        name="multimodal_agg_qwen",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        delayed_start=0,
        timeout=360,
        request_payloads=[
            # HTTP URL test
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
            ),
            # Base64 data URL test (1x1 PNG inline, avoids network fetch)
            chat_payload(
                [
                    {"type": "text", "text": "What do you see in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAAAAAA6fptVAAAACklEQVR4nGNoAAAAggCBd81ytgAAAABJRU5ErkJggg=="
                        },
                    },
                ],
                repeat_count=1,
                expected_response=[],  # Just validate no error
            ),
        ],
    ),
    # TODO: Update this test case when we have video multimodal support in vllm official components
    "multimodal_video_agg": VLLMConfig(
        name="multimodal_video_agg",
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="video_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        delayed_start=0,
        script_args=["--model", "llava-hf/LLaVA-NeXT-Video-7B-hf"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "Describe the video in detail"},
                    {
                        "type": "video_url",
                        "video_url": {
                            "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["rabbit"],
                temperature=0.7,
            )
        ],
    ),
    "multimodal_audio_agg": VLLMConfig(
        name="multimodal_audio_agg",
        # Built from WORKSPACE_DIR like the video scenario above instead of a
        # hard-coded "/workspace/..." path, so a relocated workspace still works.
        # NOTE(review): assumes WORKSPACE_DIR resolves to /workspace in the
        # default environment - confirm in tests.serve.common.
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="audio_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2-Audio-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
        timeout=500,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is recited in the audio?"},
                    {
                        "type": "audio_url",
                        "audio_url": {
                            "url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"
                        },
                    },
                ],
                repeat_count=1,
                # NOTE(review): a full-sentence expectation combined with
                # temperature=0.8 looks prone to flakiness - confirm how
                # expected_response is matched.
                expected_response=[
                    "The original content of this audio is:'yet these thoughts affected Hester Pynne less with hope than apprehension.'"
                ],
                temperature=0.8,
            )
        ],
    ),
    # TODO: Enable this test case when we have 4 GPUs runners.
    # "multimodal_disagg": VLLMConfig(
    #     name="multimodal_disagg",
    #     directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
    #     script_name="disagg.sh",
    #     marks=[pytest.mark.gpu_4, pytest.mark.vllm],
    #     model="llava-hf/llava-1.5-7b-hf",
    #     delayed_start=45,
    #     script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
    # ),
}


Alec's avatar
Alec committed
288
@pytest.fixture(params=params_with_model_mark(vllm_configs))
def vllm_config_test(request):
    """Fixture that provides different vLLM test configurations.

    Parametrized with whatever ``params_with_model_mark(vllm_configs)``
    produces; each parameter value is used as a key into ``vllm_configs``.

    Args:
        request: pytest's fixture request; ``request.param`` selects the
            configuration.

    Returns:
        The ``vllm_configs`` entry stored under ``request.param``.
    """
    return vllm_configs[request.param]


294
@pytest.mark.vllm
@pytest.mark.e2e
def test_serve_deployment(
    vllm_config_test, request, runtime_services, predownload_models
):
    """
    Test dynamo serve deployments with different graph configurations.

    Runs once per configuration supplied by the ``vllm_config_test`` fixture
    and hands that configuration, together with the pytest ``request``, to
    ``run_serve_deployment``.

    ``runtime_services`` and ``predownload_models`` are not referenced in the
    body; they are requested so that pytest sets those fixtures up before the
    deployment is started.
    """
    run_serve_deployment(vllm_config_test, request)