# test_vision_openai_server_b.py
import unittest

from test_vision_openai_server_common import *

from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestPixtralServer(TestOpenAIVisionServer):
    """Runs the tests inherited from TestOpenAIVisionServer against
    ``mistral-community/pixtral-12b``."""

    @classmethod
    def setUpClass(cls):
        """Launch the model server once for the class and store its handle.

        Sets ``model``, ``api_key``, ``process`` and ``base_url`` on the
        class; ``base_url`` ends with ``/v1`` once the launch has returned.
        """
        server_url = DEFAULT_URL_FOR_TEST
        launch_flags = [
            "--trust-remote-code",
            "--mem-fraction-static",
            "0.70",
            "--cuda-graph-max-bs",
            "4",
        ]

        cls.model = "mistral-community/pixtral-12b"
        cls.api_key = "sk-123456"
        cls.base_url = server_url
        cls.process = popen_launch_server(
            cls.model,
            server_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )
        # The launcher is given the bare URL; the "/v1" suffix is only added
        # to the class attribute afterwards.
        cls.base_url = server_url + "/v1"

    def test_video_chat_completion(self):
        # Intentional no-op override: replaces whatever test of this name the
        # base class provides, so it always passes for this model.
        pass


class TestMistral3_1Server(TestOpenAIVisionServer):
    """Runs the tests inherited from TestOpenAIVisionServer against
    ``unsloth/Mistral-Small-3.1-24B-Instruct-2503``."""

    @classmethod
    def setUpClass(cls):
        """Launch the model server once for the class and store its handle.

        Sets ``model``, ``api_key``, ``process`` and ``base_url`` on the
        class; ``base_url`` ends with ``/v1`` once the launch has returned.
        """
        server_url = DEFAULT_URL_FOR_TEST
        launch_flags = [
            "--trust-remote-code",
            "--mem-fraction-static",
            "0.75",
            "--cuda-graph-max-bs",
            "4",
        ]

        cls.model = "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
        cls.api_key = "sk-123456"
        cls.base_url = server_url
        cls.process = popen_launch_server(
            cls.model,
            server_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )
        # The launcher is given the bare URL; the "/v1" suffix is only added
        # to the class attribute afterwards.
        cls.base_url = server_url + "/v1"

    def test_video_chat_completion(self):
        # Intentional no-op override: replaces whatever test of this name the
        # base class provides, so it always passes for this model.
        pass


class TestDeepseekVL2Server(TestOpenAIVisionServer):
    """Runs the tests inherited from TestOpenAIVisionServer against
    ``deepseek-ai/deepseek-vl2-small``."""

    @classmethod
    def setUpClass(cls):
        """Launch the model server once for the class and store its handle.

        Sets ``model``, ``api_key``, ``process`` and ``base_url`` on the
        class; ``base_url`` ends with ``/v1`` once the launch has returned.
        """
        server_url = DEFAULT_URL_FOR_TEST
        launch_flags = [
            "--trust-remote-code",
            "--context-length",
            "4096",
            "--cuda-graph-max-bs",
            "4",
        ]

        cls.model = "deepseek-ai/deepseek-vl2-small"
        cls.api_key = "sk-123456"
        cls.base_url = server_url
        cls.process = popen_launch_server(
            cls.model,
            server_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )
        # The launcher is given the bare URL; the "/v1" suffix is only added
        # to the class attribute afterwards.
        cls.base_url = server_url + "/v1"

    def test_video_chat_completion(self):
        # Intentional no-op override: replaces whatever test of this name the
        # base class provides, so it always passes for this model.
        pass


class TestJanusProServer(TestOpenAIVisionServer):
    """Runs the tests inherited from TestOpenAIVisionServer against
    ``deepseek-ai/Janus-Pro-7B``."""

    @classmethod
    def setUpClass(cls):
        """Launch the model server once for the class and store its handle.

        Sets ``model``, ``api_key``, ``process`` and ``base_url`` on the
        class; ``base_url`` ends with ``/v1`` once the launch has returned.
        """
        server_url = DEFAULT_URL_FOR_TEST
        launch_flags = [
            "--trust-remote-code",
            "--mem-fraction-static",
            "0.35",
            "--cuda-graph-max-bs",
            "4",
        ]

        cls.model = "deepseek-ai/Janus-Pro-7B"
        cls.api_key = "sk-123456"
        cls.base_url = server_url
        cls.process = popen_launch_server(
            cls.model,
            server_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )
        # The launcher is given the bare URL; the "/v1" suffix is only added
        # to the class attribute afterwards.
        cls.base_url = server_url + "/v1"

    def test_video_images_chat_completion(self):
        # Intentional no-op override: replaces whatever test of this name the
        # base class provides, so it always passes for this model.
        pass

    def test_single_image_chat_completion(self):
        # Disabled for this model because the test is flaky; the empty body
        # makes it pass unconditionally.
        pass


# Skipped in CI tests.
# class TestLlama4Server(TestOpenAIVisionServer):
#     @classmethod
#     def setUpClass(cls):
#         cls.model = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
#         cls.base_url = DEFAULT_URL_FOR_TEST
#         cls.api_key = "sk-123456"
#         cls.process = popen_launch_server(
#             cls.model,
#             cls.base_url,
#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
#             other_args=[
#                 "--chat-template",
#                 "llama-4",
#                 "--mem-fraction-static",
#                 "0.8",
#                 "--tp-size=8",
#                 "--context-length=8192",
#                 "--mm-attention-backend",
#                 "fa3",
#                 "--cuda-graph-max-bs",
#                 "4",
#             ],
#         )
#         cls.base_url += "/v1"
#
#     def test_video_chat_completion(self):
#         pass


class TestGemma3itServer(TestOpenAIVisionServer):
    """Runs the tests inherited from TestOpenAIVisionServer against
    ``google/gemma-3-4b-it``."""

    @classmethod
    def setUpClass(cls):
        """Launch the model server once for the class and store its handle.

        Sets ``model``, ``api_key``, ``process`` and ``base_url`` on the
        class; ``base_url`` ends with ``/v1`` once the launch has returned.
        """
        server_url = DEFAULT_URL_FOR_TEST
        launch_flags = [
            "--trust-remote-code",
            "--mem-fraction-static",
            "0.70",
            "--enable-multimodal",
            "--cuda-graph-max-bs",
            "4",
        ]

        cls.model = "google/gemma-3-4b-it"
        cls.api_key = "sk-123456"
        cls.base_url = server_url
        cls.process = popen_launch_server(
            cls.model,
            server_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )
        # The launcher is given the bare URL; the "/v1" suffix is only added
        # to the class attribute afterwards.
        cls.base_url = server_url + "/v1"

    def test_video_chat_completion(self):
        # Intentional no-op override: replaces whatever test of this name the
        # base class provides, so it always passes for this model.
        pass


class TestGemma3nServer(TestOpenAIVisionServer):
    """Runs the tests inherited from TestOpenAIVisionServer against
    ``google/gemma-3n-E4B-it``, plus one audio test."""

    @classmethod
    def setUpClass(cls):
        """Launch the model server once for the class and store its handle.

        Sets ``model``, ``api_key``, ``process`` and ``base_url`` on the
        class; ``base_url`` ends with ``/v1`` once the launch has returned.
        """
        server_url = DEFAULT_URL_FOR_TEST
        launch_flags = [
            "--trust-remote-code",
            "--mem-fraction-static",
            "0.70",
            "--cuda-graph-max-bs",
            "4",
        ]

        cls.model = "google/gemma-3n-E4B-it"
        cls.api_key = "sk-123456"
        cls.base_url = server_url
        cls.process = popen_launch_server(
            cls.model,
            server_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )
        # The launcher is given the bare URL; the "/v1" suffix is only added
        # to the class attribute afterwards.
        cls.base_url = server_url + "/v1"

    def test_audio_chat_completion(self):
        # Only the speech check runs. The ambient-audio check
        # (_test_audio_ambient_completion) stays disabled because it is way
        # too complicated for a small LLM to pass.
        self._test_audio_speech_completion()
        # self._test_audio_ambient_completion()

class TestKimiVLServer(TestOpenAIVisionServer):
    """Runs the tests inherited from TestOpenAIVisionServer against
    ``moonshotai/Kimi-VL-A3B-Instruct``."""

    @classmethod
    def setUpClass(cls):
        """Launch the model server once for the class and store its handle.

        Sets ``model``, ``api_key``, ``process`` and ``base_url`` on the
        class; ``base_url`` ends with ``/v1`` once the launch has returned.
        """
        server_url = DEFAULT_URL_FOR_TEST
        launch_flags = [
            "--trust-remote-code",
            "--context-length",
            "4096",
            "--dtype",
            "bfloat16",
            "--cuda-graph-max-bs",
            "4",
        ]

        cls.model = "moonshotai/Kimi-VL-A3B-Instruct"
        cls.api_key = "sk-123456"
        cls.base_url = server_url
        cls.process = popen_launch_server(
            cls.model,
            server_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_flags,
        )
        # The launcher is given the bare URL; the "/v1" suffix is only added
        # to the class attribute afterwards.
        cls.base_url = server_url + "/v1"

    def test_video_images_chat_completion(self):
        # Intentional no-op override: replaces whatever test of this name the
        # base class provides, so it always passes for this model.
        pass


class TestPhi4MMServer(TestOpenAIVisionServer):
    """Runs the tests inherited from TestOpenAIVisionServer against
    ``microsoft/Phi-4-multimodal-instruct`` with its vision and speech LoRA
    adapters loaded, plus one audio test."""

    @classmethod
    def setUpClass(cls):
        """Fetch the LoRA adapter configs, then launch the model server.

        Sets ``model``, ``api_key``, ``process`` and ``base_url`` on the
        class; ``base_url`` ends with ``/v1`` once the launch has returned.
        """
        from huggingface_hub import constants, snapshot_download

        cls.model = "microsoft/Phi-4-multimodal-instruct"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"

        revision = "33e62acdd07cd7d6635badd529aa0a3467bb9c6a"

        # Manually download LoRA adapter_config.json as it's not downloaded by
        # the model loader by default. The download is pinned to `revision` so
        # the files land in the same snapshot directory that --revision and
        # --lora-paths below refer to; without the pin they would be fetched
        # for the repository's default branch instead.
        snapshot_download(
            "microsoft/Phi-4-multimodal-instruct",
            revision=revision,
            allow_patterns=["**/adapter_config.json"],
        )

        # Local hub-cache directory holding the pinned snapshot.
        snapshot_dir = f"{constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}"

        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--trust-remote-code",
                "--mem-fraction-static",
                "0.70",
                "--disable-radix-cache",
                "--max-loras-per-batch",
                "2",
                "--revision",
                revision,
                "--lora-paths",
                # Adapter names registered here ("vision", "speech") are the
                # values the request kwargs below send as "lora_path".
                f"vision={snapshot_dir}/vision-lora",
                f"speech={snapshot_dir}/speech-lora",
                "--cuda-graph-max-bs",
                "4",
            ],
        )
        cls.base_url += "/v1"

    def get_vision_request_kwargs(self):
        """Return extra request kwargs selecting the "vision" LoRA adapter,
        with ``top_k=1`` and ``top_p=1.0``."""
        return {
            "extra_body": {
                "lora_path": "vision",
                "top_k": 1,
                "top_p": 1.0,
            }
        }

    def get_audio_request_kwargs(self):
        """Return extra request kwargs selecting the "speech" LoRA adapter,
        with ``top_k=1`` and ``top_p=1.0``."""
        return {
            "extra_body": {
                "lora_path": "speech",
                "top_k": 1,
                "top_p": 1.0,
            }
        }

    def test_audio_chat_completion(self):
        self._test_audio_speech_completion()
        # This _test_audio_ambient_completion test is way too complicated to pass for a small LLM
        # self._test_audio_ambient_completion()


class TestVILAServer(TestOpenAIVisionServer):
    """Runs the tests inherited from TestOpenAIVisionServer against
    ``Efficient-Large-Model/NVILA-Lite-2B-hf-0626`` at a pinned revision."""

    @classmethod
    def setUpClass(cls):
        """Launch the model server once for the class and store its handle.

        Sets ``model``, ``api_key``, ``revision``, ``process`` and
        ``base_url`` on the class; ``base_url`` ends with ``/v1`` once the
        launch has returned. The API key is also handed to the launcher.
        """
        server_url = DEFAULT_URL_FOR_TEST

        cls.model = "Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
        cls.base_url = server_url
        cls.api_key = "sk-123456"
        cls.revision = "6bde1de5964b40e61c802b375fff419edc867506"

        launch_flags = [
            "--trust-remote-code",
            "--context-length=65536",
            f"--revision={cls.revision}",
            "--cuda-graph-max-bs",
            "4",
        ]
        cls.process = popen_launch_server(
            cls.model,
            server_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=launch_flags,
        )
        # The launcher is given the bare URL; the "/v1" suffix is only added
        # to the class attribute afterwards.
        cls.base_url = server_url + "/v1"


304
if __name__ == "__main__":
    # Drop the star-imported base class from this module's namespace before
    # handing over to unittest, leaving only the subclasses defined in this
    # file as module-level test classes.
    del TestOpenAIVisionServer
    unittest.main()