test_vllm.py 6.24 KB
Newer Older
1
2
3
4
5
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import os
6
from dataclasses import dataclass, field
7
8
9

import pytest

Alec's avatar
Alec committed
10
from tests.serve.common import params_with_model_mark, run_serve_deployment
11
12
13
14
15
16
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
    chat_payload,
    chat_payload_default,
    completion_payload_default,
    metric_payload_default,
17
18
19
20
21
22
)

logger = logging.getLogger(__name__)


def _default_stragglers() -> list[str]:
    """Return a fresh default value for ``VLLMConfig.stragglers``."""
    return ["VLLM:EngineCore"]


@dataclass
class VLLMConfig(EngineConfig):
    """Engine configuration used by the vLLM test scenarios."""

    # A new single-entry list is created per instance, so instances never
    # share (and cannot accidentally mutate) one another's default.
    stragglers: list[str] = field(default_factory=_default_stragglers)
27
28


29
# Directory holding the vLLM backend launch scripts; the VLLM_DIR environment
# variable overrides the default path when it is set.
vllm_dir = os.getenv("VLLM_DIR", "/workspace/components/backends/vllm")
30
31
32
33
34

# vLLM test configurations
#
# Scenarios are declared once in ``_VLLM_SCENARIOS`` and ``vllm_configs`` is
# derived from that list, keyed by each scenario's own ``name``.
_SMALL_QWEN_MODEL = "Qwen/Qwen3-0.6B"
_DEEPSEEK_LITE_MODEL = "deepseek-ai/DeepSeek-V2-Lite"
_LLAVA_MODEL = "llava-hf/llava-1.5-7b-hf"
_QWEN_VL_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
_LLAVA_VIDEO_MODEL = "llava-hf/LLaVA-NeXT-Video-7B-hf"
_MULTIMODAL_DIR = "/workspace/examples/multimodal"
_SAMPLE_IMAGE_URL = "http://images.cocodataset.org/test2017/000000155781.jpg"
_SAMPLE_VIDEO_URL = (
    "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"
)


def _image_question_content():
    """Return a fresh text + image message body asking what the image shows."""
    return [
        {"type": "text", "text": "What is in this image?"},
        {"type": "image_url", "image_url": {"url": _SAMPLE_IMAGE_URL}},
    ]


_VLLM_SCENARIOS = [
    VLLMConfig(
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_1],
        model=_SMALL_QWEN_MODEL,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
            metric_payload_default(min_num_requests=6),
        ],
    ),
    VLLMConfig(
        name="agg-router",
        directory=vllm_dir,
        script_name="agg_router.sh",
        marks=[pytest.mark.gpu_2],
        model=_SMALL_QWEN_MODEL,
        request_payloads=[
            chat_payload_default(
                # Regexes handed to the payload as its ``expected_log`` list.
                expected_log=[
                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: \d+, logit: ",
                ]
            )
        ],
        env={
            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
        },
    ),
    VLLMConfig(
        name="disaggregated",
        directory=vllm_dir,
        script_name="disagg.sh",
        marks=[pytest.mark.gpu_2],
        model=_SMALL_QWEN_MODEL,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    VLLMConfig(
        name="deepep",
        directory=vllm_dir,
        script_name="dsr1_dep.sh",
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.vllm,
            pytest.mark.h100,
        ],
        model=_DEEPSEEK_LITE_MODEL,
        script_args=[
            "--model",
            _DEEPSEEK_LITE_MODEL,
            "--num-nodes",
            "1",
            "--node-rank",
            "0",
            "--gpus-per-node",
            "2",
        ],
        timeout=700,
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
        ],
    ),
    VLLMConfig(
        name="multimodal_agg_llava",
        directory=_MULTIMODAL_DIR,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_2],
        model=_LLAVA_MODEL,
        script_args=["--model", _LLAVA_MODEL],
        request_payloads=[
            chat_payload(
                _image_question_content(),
                repeat_count=1,
                expected_response=["bus"],
                temperature=0.0,
            )
        ],
    ),
    VLLMConfig(
        name="multimodal_agg_qwen",
        directory=_MULTIMODAL_DIR,
        script_name="agg.sh",
        marks=[pytest.mark.gpu_2],
        model=_QWEN_VL_MODEL,
        delayed_start=0,
        script_args=["--model", _QWEN_VL_MODEL],
        timeout=360,
        request_payloads=[
            chat_payload(
                _image_question_content(),
                repeat_count=1,
                expected_response=["bus"],
            )
        ],
    ),
    VLLMConfig(
        name="multimodal_video_agg",
        directory=_MULTIMODAL_DIR,
        script_name="video_agg.sh",
        marks=[pytest.mark.gpu_2],
        model=_LLAVA_VIDEO_MODEL,
        delayed_start=0,
        script_args=["--model", _LLAVA_VIDEO_MODEL],
        timeout=360,
        request_payloads=[
            chat_payload(
                [
                    {"type": "text", "text": "Describe the video in detail"},
                    {"type": "video_url", "video_url": {"url": _SAMPLE_VIDEO_URL}},
                ],
                repeat_count=1,
                expected_response=["rabbit"],
            )
        ],
    ),
    # TODO: Enable this test case when we have 4 GPUs runners.
    # VLLMConfig(
    #     name="multimodal_disagg",
    #     directory="/workspace/examples/multimodal",
    #     script_name="disagg.sh",
    #     marks=[pytest.mark.gpu_4, pytest.mark.vllm],
    #     model="llava-hf/llava-1.5-7b-hf",
    #     delayed_start=45,
    #     script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
    # ),
]

# Scenario name -> configuration, in declaration order.
vllm_configs = {scenario.name: scenario for scenario in _VLLM_SCENARIOS}


Alec's avatar
Alec committed
188
@pytest.fixture(params=params_with_model_mark(vllm_configs))
def vllm_config_test(request):
    """Return the vLLM scenario configuration selected by the current param.

    The fixture is parametrized from ``vllm_configs``; ``request.param`` is
    used as the key into that mapping.
    """
    scenario_key = request.param
    return vllm_configs[scenario_key]


194
@pytest.mark.vllm
@pytest.mark.e2e
def test_serve_deployment(
    vllm_config_test, request, runtime_services, predownload_models
):
    """Exercise a dynamo serve deployment for one vLLM graph configuration.

    The configuration supplied by ``vllm_config_test`` is handed to
    ``run_serve_deployment`` together with the pytest ``request`` object.
    ``runtime_services`` and ``predownload_models`` are never referenced in
    the body; presumably they are fixtures requested only for their setup
    side effects (TODO confirm against conftest).
    """
    run_serve_deployment(vllm_config_test, request)