test_vision_openai_server_a.py 10.3 KB
Newer Older
1
2
3
4
5
6
"""
Usage:
python3 -m unittest test_vision_openai_server.TestOpenAIVisionServer.test_mixed_batch
python3 -m unittest test_vision_openai_server.TestOpenAIVisionServer.test_multi_images_chat_completion
"""

7
8
import unittest

9
10
11
12
13
14
15
16
from test_vision_openai_server_common import *

from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    popen_launch_server,
)


17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
class TestLlava(ImageOpenAITestMixin):
    @classmethod
    def setUpClass(cls):
        cls.model = "lmms-lab/llava-onevision-qwen2-0.5b-ov"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
        )
        cls.base_url += "/v1"


class TestQwen2VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin):
33
34
35
36
37
38
39
40
41
42
43
44
    @classmethod
    def setUpClass(cls):
        cls.model = "Qwen/Qwen2-VL-7B-Instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=[
                "--mem-fraction-static",
45
                "0.35",
46
47
                "--cuda-graph-max-bs",
                "4",
48
49
50
51
52
            ],
        )
        cls.base_url += "/v1"


53
class TestQwen2_5_VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin):
54
55
56
57
58
59
60
61
62
63
64
65
    @classmethod
    def setUpClass(cls):
        cls.model = "Qwen/Qwen2.5-VL-7B-Instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=[
                "--mem-fraction-static",
66
                "0.35",
67
68
                "--cuda-graph-max-bs",
                "4",
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
            ],
        )
        cls.base_url += "/v1"


class TestVLMContextLengthIssue(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = "Qwen/Qwen2-VL-7B-Instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=[
                "--context-length",
                "300",
88
                "--mem-fraction-static=0.75",
89
90
                "--cuda-graph-max-bs",
                "4",
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
            ],
        )
        cls.base_url += "/v1"

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_single_image_chat_completion(self):
        client = openai.Client(api_key=self.api_key, base_url=self.base_url)

        with self.assertRaises(openai.BadRequestError) as cm:
            client.chat.completions.create(
                model="default",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {"url": IMAGE_MAN_IRONING_URL},
                            },
                            {
                                "type": "text",
                                "text": "Give a lengthy description of this picture",
                            },
                        ],
                    },
                ],
                temperature=0,
            )

        # context length is checked first, then max_req_input_len, which is calculated from the former
        assert (
            "Multimodal prompt is too long after expanding multimodal tokens."
            in str(cm.exception)
            or "is longer than the model's context length" in str(cm.exception)
        )


131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# Note(Xinyuan): mllama is not stable for now, skip for CI
# class TestMllamaServer(TestOpenAIVisionServer):
#     @classmethod
#     def setUpClass(cls):
#         cls.model = "meta-llama/Llama-3.2-11B-Vision-Instruct"
#         cls.base_url = DEFAULT_URL_FOR_TEST
#         cls.api_key = "sk-123456"
#         cls.process = popen_launch_server(
#             cls.model,
#             cls.base_url,
#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
#             api_key=cls.api_key,
#         )
#         cls.base_url += "/v1"

146

147
class TestMinicpmvServer(ImageOpenAITestMixin):
148
149
150
151
152
153
154
155
156
157
158
159
    @classmethod
    def setUpClass(cls):
        cls.model = "openbmb/MiniCPM-V-2_6"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--trust-remote-code",
                "--mem-fraction-static",
160
                "0.35",
161
162
                "--cuda-graph-max-bs",
                "4",
163
164
165
166
167
            ],
        )
        cls.base_url += "/v1"


tc-mb's avatar
tc-mb committed
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
class TestMinicpmv4Server(ImageOpenAITestMixin):
    @classmethod
    def setUpClass(cls):
        cls.model = "openbmb/MiniCPM-V-4"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--trust-remote-code",
                "--mem-fraction-static",
                "0.35",
                "--cuda-graph-max-bs",
                "4",
            ],
        )
        cls.base_url += "/v1"


189
class TestInternVL2_5Server(ImageOpenAITestMixin):
190
191
192
193
194
195
196
197
198
    @classmethod
    def setUpClass(cls):
        cls.model = "OpenGVLab/InternVL2_5-2B"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
199
200
201
202
203
            other_args=[
                "--trust-remote-code",
                "--cuda-graph-max-bs",
                "4",
            ],
204
205
206
207
        )
        cls.base_url += "/v1"


tc-mb's avatar
tc-mb committed
208
class TestMinicpmo2_6Server(ImageOpenAITestMixin, AudioOpenAITestMixin):
209
210
211
212
213
214
215
216
217
218
219
220
    @classmethod
    def setUpClass(cls):
        cls.model = "openbmb/MiniCPM-o-2_6"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--trust-remote-code",
                "--mem-fraction-static",
221
                "0.65",
222
223
                "--cuda-graph-max-bs",
                "4",
224
225
226
227
228
            ],
        )
        cls.base_url += "/v1"


229
class TestMimoVLServer(ImageOpenAITestMixin):
Xinyuan Tong's avatar
Xinyuan Tong committed
230
231
232
233
234
235
236
237
238
239
240
241
242
243
    @classmethod
    def setUpClass(cls):
        cls.model = "XiaomiMiMo/MiMo-VL-7B-RL"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=[
                "--trust-remote-code",
                "--mem-fraction-static",
                "0.6",
244
245
                "--cuda-graph-max-bs",
                "4",
Xinyuan Tong's avatar
Xinyuan Tong committed
246
247
248
249
250
            ],
        )
        cls.base_url += "/v1"


251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
class TestVILAServer(ImageOpenAITestMixin):
    @classmethod
    def setUpClass(cls):
        cls.model = "Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.revision = "6bde1de5964b40e61c802b375fff419edc867506"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=[
                "--trust-remote-code",
                "--context-length=65536",
                f"--revision={cls.revision}",
                "--cuda-graph-max-bs",
                "4",
            ],
        )
        cls.base_url += "/v1"


class TestPhi4MMServer(ImageOpenAITestMixin, AudioOpenAITestMixin):
    @classmethod
    def setUpClass(cls):
        # Manually download LoRA adapter_config.json as it's not downloaded by the model loader by default.
        from huggingface_hub import constants, snapshot_download

        snapshot_download(
            "microsoft/Phi-4-multimodal-instruct",
            allow_patterns=["**/adapter_config.json"],
        )

        cls.model = "microsoft/Phi-4-multimodal-instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"

        revision = "33e62acdd07cd7d6635badd529aa0a3467bb9c6a"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--trust-remote-code",
                "--mem-fraction-static",
                "0.70",
                "--disable-radix-cache",
                "--max-loras-per-batch",
                "2",
                "--revision",
                revision,
                "--lora-paths",
                f"vision={constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}/vision-lora",
                f"speech={constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}/speech-lora",
                "--cuda-graph-max-bs",
                "4",
            ],
        )
        cls.base_url += "/v1"

    def get_vision_request_kwargs(self):
        return {
            "extra_body": {
                "lora_path": "vision",
                "top_k": 1,
                "top_p": 1.0,
            }
        }

    def get_audio_request_kwargs(self):
        return {
            "extra_body": {
                "lora_path": "speech",
                "top_k": 1,
                "top_p": 1.0,
            }
        }

    # This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
    def test_audio_ambient_completion(self):
        pass


335
if __name__ == "__main__":
336
337
338
339
340
341
    del (
        TestOpenAIOmniServerBase,
        ImageOpenAITestMixin,
        VideoOpenAITestMixin,
        AudioOpenAITestMixin,
    )
342
    unittest.main()