# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import os
6
from dataclasses import dataclass, field
7
8
9

import pytest

10
11
12
13
14
from tests.serve.common import (
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
15
16
17
18
19
20
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
    metric_payload_default,
21
22
23
24
25
26
)

logger = logging.getLogger(__name__)


@dataclass
class VLLMConfig(EngineConfig):
    """Configuration for vLLM test scenarios."""

    # Defaults to a fresh one-element list per instance (hence default_factory
    # rather than a shared mutable default).
    # NOTE(review): presumably these are names of leftover processes to clean
    # up after a run -- confirm against EngineConfig.
    stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])
31
32


33
# Directory that holds the vLLM example scripts referenced by the configs below.
# A non-empty VLLM_DIR environment variable takes precedence; an unset or empty
# value falls back to the examples tree under WORKSPACE_DIR.
vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/vllm"
)
36
37
38
39
40

# vLLM test configurations
#
# Maps a scenario id to its VLLMConfig. The ids are what the `vllm_config_test`
# fixture is parametrized over (it looks each one up in this dict), so every
# key doubles as the pytest parameter for that scenario.
vllm_configs = {
    "aggregated": VLLMConfig(
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
        ],
    ),
    "aggregated_lmcache": VLLMConfig(
        name="aggregated_lmcache",
        directory=vllm_dir,
        script_name="agg_lmcache.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
            metric_payload_default(min_num_requests=6, backend="lmcache"),
        ],
    ),
    # The two request-plane scenarios share one script and differ only in the
    # flag passed through script_args.
    "agg-request-plane-tcp": VLLMConfig(
        name="agg-request-plane-tcp",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        script_args=["--tcp"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-request-plane-http": VLLMConfig(
        name="agg-request-plane-http",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        script_args=["--http"],
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "agg-router": VLLMConfig(
        name="agg-router",
        directory=vllm_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(
                # Regexes expected to show up in the logs; the DYN_LOG setting
                # in `env` below raises verbosity for the modules involved.
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
    ),
    "disaggregated": VLLMConfig(
        name="disaggregated",
        directory=vllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    "deepep": VLLMConfig(
        name="deepep",
        directory=vllm_dir,
        script_name="dsr1_dep.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.vllm,
            pytest.mark.h100,
        ],
        model="deepseek-ai/DeepSeek-V2-Lite",
        script_args=[
            "--model",
            "deepseek-ai/DeepSeek-V2-Lite",
            "--num-nodes",
            "1",
            "--node-rank",
            "0",
            "--gpus-per-node",
            "2",
        ],
        timeout=700,
        request_payloads=[
            chat_payload_default(expected_response=["joke"]),
            completion_payload_default(expected_response=["joke"]),
        ],
    ),
    "multimodal_agg_llava_epd": VLLMConfig(
        name="multimodal_agg_llava_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
                temperature=0.0,
            )
        ],
    ),
    "multimodal_agg_qwen_epd": VLLMConfig(
        name="multimodal_agg_qwen_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
            )
        ],
    ),
    "multimodal_agg_qwen": VLLMConfig(
        name="multimodal_agg_qwen",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        delayed_start=0,
        timeout=360,
        request_payloads=[
            # HTTP URL test
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
            ),
            # Base64 data URL test (1x1 PNG inline, avoids network fetch)
            chat_payload(
                [
                    {"type": "text", "text": "What do you see in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAAAAAA6fptVAAAACklEQVR4nGNoAAAAggCBd81ytgAAAABJRU5ErkJggg=="
                        },
                    },
                ],
                repeat_count=1,
                expected_response=[],  # Just validate no error
            ),
        ],
    ),
    # TODO: Update this test case when we have video multimodal support in vllm official components
    "multimodal_video_agg": VLLMConfig(
        name="multimodal_video_agg",
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="video_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        delayed_start=0,
        script_args=["--model", "llava-hf/LLaVA-NeXT-Video-7B-hf"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "Describe the video in detail"},
                    {
                        "type": "video_url",
                        "video_url": {
                            "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["rabbit"],
                temperature=0.7,
            )
        ],
    ),
    "multimodal_audio_agg": VLLMConfig(
        name="multimodal_audio_agg",
        # Resolved from WORKSPACE_DIR (same as the video scenario above) rather
        # than a hard-coded "/workspace/..." path, so the scenario also works
        # when the checkout lives somewhere else.
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="audio_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2-Audio-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2-Audio-7B-Instruct"],
        timeout=500,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is recited in the audio?"},
                    {
                        "type": "audio_url",
                        "audio_url": {
                            "url": "https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=[
                    "The original content of this audio is:'yet these thoughts affected Hester Pynne less with hope than apprehension.'"
                ],
                temperature=0.8,
            )
        ],
    ),
    # TODO: Enable this test case when we have 4 GPUs runners.
    # "multimodal_disagg": VLLMConfig(
    #     name="multimodal_disagg",
    #     directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
    #     script_name="disagg.sh",
    #     marks=[pytest.mark.gpu_4, pytest.mark.vllm],
    #     model="llava-hf/llava-1.5-7b-hf",
    #     delayed_start=45,
    #     script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
    # ),
}


Alec's avatar
Alec committed
301
@pytest.fixture(params=params_with_model_mark(vllm_configs))
def vllm_config_test(request):
    """Fixture that provides different vLLM test configurations.

    The fixture is parametrized with ``params_with_model_mark(vllm_configs)``;
    each parameter value is used as a key into ``vllm_configs``.

    Returns:
        The ``vllm_configs`` entry stored under ``request.param``.
    """
    return vllm_configs[request.param]


307
@pytest.mark.vllm
@pytest.mark.e2e
def test_serve_deployment(
    vllm_config_test, request, runtime_services, predownload_models
):
    """
    Test dynamo serve deployments with different graph configurations.

    Runs once per configuration supplied by the ``vllm_config_test`` fixture
    and hands that configuration, together with the pytest ``request``, to
    ``run_serve_deployment``.
    """
    # NOTE(review): runtime_services and predownload_models are not used in the
    # body; presumably they are requested only for their setup side effects --
    # confirm against the conftest that defines them.
    run_serve_deployment(vllm_config_test, request)