# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import os
from dataclasses import dataclass, field

import pytest

from tests.serve.common import (
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
)
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
    metric_payload_default,
)

logger = logging.getLogger(__name__)


@dataclass
class VLLMConfig(EngineConfig):
    """Configuration for vLLM test scenarios.

    Extends ``EngineConfig`` with a vLLM-specific default for ``stragglers``;
    all other fields are inherited unchanged.
    """

    # Built through ``default_factory`` so each instance gets its own list
    # rather than sharing one mutable default.
    # NOTE(review): presumably names of leftover processes the harness should
    # look for after a run -- confirm against EngineConfig / the test runner.
    stragglers: list[str] = field(default_factory=lambda: ["VLLM:EngineCore"])


# Directory holding the vLLM launch scripts. The VLLM_DIR environment variable
# takes precedence; because of the ``or``, an unset *or empty* value falls
# back to the in-repo backend directory under WORKSPACE_DIR.
vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "components/backends/vllm"
)

# vLLM test configurations
#
# Maps a scenario name to the VLLMConfig describing it. Each key is repeated
# as the config's ``name``. The ``vllm_config_test`` fixture is parametrized
# from this mapping and looks entries up by key.
vllm_configs = {
    # Single-GPU scenario; the only one that also sends a metrics payload.
    "aggregated": VLLMConfig(
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6, backend="vllm"),
        ],
    ),
    # Router scenario: the chat payload carries regexes via ``expected_log``
    # and DYN_LOG raises the kv_router publisher/scheduler log levels.
    # NOTE(review): presumably the harness matches these regexes against the
    # deployment logs -- confirm in chat_payload_default.
    "agg-router": VLLMConfig(
        name="agg-router",
        directory=vllm_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_id=\d+ dp_rank=.*?, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
    ),
    "disaggregated": VLLMConfig(
        name="disaggregated",
        directory=vllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    # Carries an extra h100 mark and a longer timeout than the other entries.
    "deepep": VLLMConfig(
        name="deepep",
        directory=vllm_dir,
        script_name="dsr1_dep.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.vllm,
            pytest.mark.h100,
        ],
        model="deepseek-ai/DeepSeek-V2-Lite",
        script_args=[
            "--model",
            "deepseek-ai/DeepSeek-V2-Lite",
            "--num-nodes",
            "1",
            "--node-rank",
            "0",
            "--gpus-per-node",
            "2",
        ],
        timeout=700,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    # The multimodal scenarios run scripts from examples/multimodal instead of
    # ``vllm_dir``.
    "multimodal_agg_llava": VLLMConfig(
        name="multimodal_agg_llava",
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="agg.sh",
        marks=[pytest.mark.gpu_2],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
                temperature=0.0,
            )
        ],
    ),
    "multimodal_agg_qwen": VLLMConfig(
        name="multimodal_agg_qwen",
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="agg.sh",
        marks=[pytest.mark.gpu_2],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "What is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "http://images.cocodataset.org/test2017/000000155781.jpg"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["bus"],
            )
        ],
    ),
    "multimodal_video_agg": VLLMConfig(
        name="multimodal_video_agg",
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="video_agg.sh",
        marks=[pytest.mark.gpu_2],
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        delayed_start=0,
        script_args=["--model", "llava-hf/LLaVA-NeXT-Video-7B-hf"],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "Describe the video in detail"},
                    {
                        "type": "video_url",
                        "video_url": {
                            "url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"
                        },
                    },
                ],
                repeat_count=1,
                expected_response=["rabbit"],
                temperature=0.7,
            )
        ],
    ),
    # TODO: Enable this test case when we have 4 GPUs runners.
    # "multimodal_disagg": VLLMConfig(
    #     name="multimodal_disagg",
    #     directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
    #     script_name="disagg.sh",
    #     marks=[pytest.mark.gpu_4, pytest.mark.vllm],
    #     model="llava-hf/llava-1.5-7b-hf",
    #     delayed_start=45,
    #     script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
    # ),
}


@pytest.fixture(params=params_with_model_mark(vllm_configs))
def vllm_config_test(request):
    """Fixture that provides different vLLM test configurations.

    The parameter list is produced by ``params_with_model_mark(vllm_configs)``;
    for each parametrized case ``request.param`` is used as a key into
    ``vllm_configs`` and the matching config object is returned.
    """
    return vllm_configs[request.param]


@pytest.mark.vllm
@pytest.mark.e2e
def test_serve_deployment(
    vllm_config_test, request, runtime_services, predownload_models
):
    """
    Test dynamo serve deployments with different graph configurations.

    Runs once per configuration supplied by the ``vllm_config_test`` fixture
    and hands it, together with the pytest ``request`` object, to
    ``run_serve_deployment``.

    ``runtime_services`` and ``predownload_models`` are requested but never
    referenced in the body.
    NOTE(review): presumably they are needed only for their setup side
    effects -- confirm in the fixtures' definitions.
    """
    run_serve_deployment(vllm_config_test, request)