Add 4-GPU runner tests and split existing tests (#6383)

f11481b9 · fzyzcjy · GitHub · 9d24c3ff · f11481b9 · f11481b9
Unverified Commit f11481b9 authored May 19, 2025 by fzyzcjy Committed by GitHub May 18, 2025
6 changed files
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -89,6 +89,25 @@ jobs:
          cd test/srt
          python3 run_suite.py --suite per-commit-2-gpu
+  unittest-test-backend-4-gpu:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    needs: [unit-test-frontend, unit-test-backend-2-gpu]
+    runs-on: 4-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Install dependencies
+        run: |
+          bash scripts/ci_install_dependency.sh
+      - name: Run test
+        timeout-minutes: 20
+        run: |
+          cd test/srt
+          python3 run_suite.py --suite per-commit-4-gpu
  unittest-test-backend-8-gpu:
    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
        github.event.pull_request.draft == false

--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -81,7 +81,8 @@ suites = {
        TestFile("test_vertex_endpoint.py", 31),
        TestFile("test_vision_chunked_prefill.py", 175),
        TestFile("test_vlm_accuracy.py", 60),
-        TestFile("test_vision_openai_server.py", 637),
+        TestFile("test_vision_openai_server_a.py", 700),
+        TestFile("test_vision_openai_server_b.py", 700),
        TestFile("test_w8a8_quantization.py", 46),
        TestFile("models/lora/test_lora_cuda_graph.py", 250),
    ],
@@ -104,17 +105,19 @@ suites = {
    "per-commit-2-gpu-amd": [
        TestFile("test_mla_tp.py", 170),
    ],
+    "per-commit-4-gpu": [
+        TestFile("test_local_attn.py", 250),
+        TestFile("test_pp_single_node.py", 150),
+    ],
    "per-commit-8-gpu": [
        # Disabled deepep tests temporarily because it takes too much time.
        # TODO: re-enable them after reducing the test time with compilation cache and smaller models.
        # TestFile("test_deepep_intranode.py", 50),
        # TestFile("test_deepep_low_latency.py", 50),
        # TestFile("test_moe_deepep_eval_accuracy_large.py", 250),
-        TestFile("test_disaggregation.py", 210),
+        # TestFile("test_disaggregation.py", 210), # disabled since we have different_tp test
-        TestFile("test_local_attn.py", 250),
        TestFile("test_disaggregation_different_tp.py", 210),
        TestFile("test_full_deepseek_v3.py", 250),
-        TestFile("test_pp_single_node.py", 150),
    ],
    "per-commit-8-gpu-amd": [
        TestFile("test_full_deepseek_v3.py", 250),

--- a/test/srt/test_pp_single_node.py
+++ b/test/srt/test_pp_single_node.py
@@ -34,7 +34,7 @@ class TestPPAccuracy(unittest.TestCase):
                "--tp-size",
                2,
                "--pp-size",
-                4,
+                2,
                "--chunked-prefill-size",
                256,
            ],

--- a/test/srt/test_vision_openai_server_a.py
+++ b/test/srt/test_vision_openai_server_a.py
+"""
+Usage:
+python3 -m unittest test_vision_openai_server.TestOpenAIVisionServer.test_mixed_batch
+python3 -m unittest test_vision_openai_server.TestOpenAIVisionServer.test_multi_images_chat_completion
+"""
+from test_vision_openai_server_common import *
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+class TestQwen2VLServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Qwen/Qwen2-VL-7B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--mem-fraction-static",
+                "0.4",
+            ],
+        )
+        cls.base_url += "/v1"
+class TestQwen2_5_VLServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Qwen/Qwen2.5-VL-7B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--mem-fraction-static",
+                "0.4",
+            ],
+        )
+        cls.base_url += "/v1"
+class TestVLMContextLengthIssue(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "Qwen/Qwen2-VL-7B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=[
+                "--context-length",
+                "300",
+                "--mem-fraction-static=0.80",
+            ],
+        )
+        cls.base_url += "/v1"
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+    def test_single_image_chat_completion(self):
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        with self.assertRaises(openai.BadRequestError) as cm:
+            client.chat.completions.create(
+                model="default",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": IMAGE_MAN_IRONING_URL},
+                            },
+                            {
+                                "type": "text",
+                                "text": "Give a lengthy description of this picture",
+                            },
+                        ],
+                    },
+                ],
+                temperature=0,
+            )
+        # context length is checked first, then max_req_input_len, which is calculated from the former
+        assert (
+            "Multimodal prompt is too long after expanding multimodal tokens."
+            in str(cm.exception)
+            or "is longer than the model's context length" in str(cm.exception)
+        )
+class TestMllamaServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+        )
+        cls.base_url += "/v1"
+    def test_video_chat_completion(self):
+        pass
+class TestMinicpmvServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "openbmb/MiniCPM-V-2_6"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.4",
+            ],
+        )
+        cls.base_url += "/v1"
+class TestInternVL2_5Server(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "OpenGVLab/InternVL2_5-2B"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=["--trust-remote-code"],
+        )
+        cls.base_url += "/v1"
+class TestMinicpmoServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "openbmb/MiniCPM-o-2_6"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.7",
+            ],
+        )
+        cls.base_url += "/v1"
+    def test_audio_chat_completion(self):
+        self._test_audio_speech_completion()
+        self._test_audio_ambient_completion()
+if __name__ == "__main__":
+    unittest.main()
--- a/test/srt/test_vision_openai_server_b.py
+++ b/test/srt/test_vision_openai_server_b.py
+from test_vision_openai_server_common import *
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+class TestPixtralServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "mistral-community/pixtral-12b"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.73",
+            ],
+        )
+        cls.base_url += "/v1"
+    def test_video_chat_completion(self):
+        pass
+class TestMistral3_1Server(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.8",
+            ],
+        )
+        cls.base_url += "/v1"
+    def test_video_chat_completion(self):
+        pass
+class TestDeepseekVL2Server(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "deepseek-ai/deepseek-vl2-small"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--context-length",
+                "4096",
+            ],
+        )
+        cls.base_url += "/v1"
+    def test_video_chat_completion(self):
+        pass
+class TestDeepseekVL2TinyServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "deepseek-ai/deepseek-vl2-tiny"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--context-length",
+                "4096",
+            ],
+        )
+        cls.base_url += "/v1"
+    def test_video_chat_completion(self):
+        pass
+class TestJanusProServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "deepseek-ai/Janus-Pro-7B"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.4",
+            ],
+        )
+        cls.base_url += "/v1"
+    def test_video_chat_completion(self):
+        pass
+    def test_single_image_chat_completion(self):
+        # Skip this test because it is flaky
+        pass
+## Skip for ci test
+# class TestLlama4Server(TestOpenAIVisionServer):
+#     @classmethod
+#     def setUpClass(cls):
+#         cls.model = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+#         cls.base_url = DEFAULT_URL_FOR_TEST
+#         cls.api_key = "sk-123456"
+#         cls.process = popen_launch_server(
+#             cls.model,
+#             cls.base_url,
+#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+#             other_args=[
+#                 "--chat-template",
+#                 "llama-4",
+#                 "--mem-fraction-static",
+#                 "0.8",
+#                 "--tp-size=8",
+#                 "--context-length=8192",
+#             ],
+#         )
+#         cls.base_url += "/v1"
+#     def test_video_chat_completion(self):
+#         pass
+class TestGemma3itServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "google/gemma-3-4b-it"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.75",
+                "--enable-multimodal",
+            ],
+        )
+        cls.base_url += "/v1"
+    def test_video_chat_completion(self):
+        pass
+class TestKimiVLServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "moonshotai/Kimi-VL-A3B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--context-length",
+                "4096",
+                "--dtype",
+                "bfloat16",
+            ],
+        )
+        cls.base_url += "/v1"
+    def test_video_chat_completion(self):
+        pass
+if __name__ == "__main__":
+    unittest.main()
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
-"""
-Usage:
-python3 -m unittest test_vision_openai_server.TestOpenAIVisionServer.test_mixed_batch
-python3 -m unittest test_vision_openai_server.TestOpenAIVisionServer.test_multi_images_chat_completion
-"""
 import base64
 import io
 import json
@@ -472,362 +466,3 @@ class TestOpenAIVisionServer(CustomTestCase):
    def test_audio_chat_completion(self):
        pass
-class TestQwen2VLServer(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "Qwen/Qwen2-VL-7B-Instruct"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            api_key=cls.api_key,
-            other_args=[
-                "--mem-fraction-static",
-                "0.4",
-            ],
-        )
-        cls.base_url += "/v1"
-class TestQwen2_5_VLServer(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "Qwen/Qwen2.5-VL-7B-Instruct"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            api_key=cls.api_key,
-            other_args=[
-                "--mem-fraction-static",
-                "0.4",
-            ],
-        )
-        cls.base_url += "/v1"
-class TestVLMContextLengthIssue(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "Qwen/Qwen2-VL-7B-Instruct"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            api_key=cls.api_key,
-            other_args=[
-                "--context-length",
-                "300",
-                "--mem-fraction-static=0.80",
-            ],
-        )
-        cls.base_url += "/v1"
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-    def test_single_image_chat_completion(self):
-        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
-        with self.assertRaises(openai.BadRequestError) as cm:
-            client.chat.completions.create(
-                model="default",
-                messages=[
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "image_url",
-                                "image_url": {"url": IMAGE_MAN_IRONING_URL},
-                            },
-                            {
-                                "type": "text",
-                                "text": "Give a lengthy description of this picture",
-                            },
-                        ],
-                    },
-                ],
-                temperature=0,
-            )
-        # context length is checked first, then max_req_input_len, which is calculated from the former
-        assert (
-            "Multimodal prompt is too long after expanding multimodal tokens."
-            in str(cm.exception)
-            or "is longer than the model's context length" in str(cm.exception)
-        )
-class TestMllamaServer(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            api_key=cls.api_key,
-        )
-        cls.base_url += "/v1"
-    def test_video_chat_completion(self):
-        pass
-class TestMinicpmvServer(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "openbmb/MiniCPM-V-2_6"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--mem-fraction-static",
-                "0.4",
-            ],
-        )
-        cls.base_url += "/v1"
-class TestInternVL2_5Server(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "OpenGVLab/InternVL2_5-2B"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--trust-remote-code"],
-        )
-        cls.base_url += "/v1"
-class TestMinicpmoServer(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "openbmb/MiniCPM-o-2_6"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--mem-fraction-static",
-                "0.7",
-            ],
-        )
-        cls.base_url += "/v1"
-    def test_audio_chat_completion(self):
-        self._test_audio_speech_completion()
-        self._test_audio_ambient_completion()
-class TestPixtralServer(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "mistral-community/pixtral-12b"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--mem-fraction-static",
-                "0.73",
-            ],
-        )
-        cls.base_url += "/v1"
-    def test_video_chat_completion(self):
-        pass
-class TestMistral3_1Server(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--mem-fraction-static",
-                "0.8",
-            ],
-        )
-        cls.base_url += "/v1"
-    def test_video_chat_completion(self):
-        pass
-class TestDeepseekVL2Server(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "deepseek-ai/deepseek-vl2-small"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--context-length",
-                "4096",
-            ],
-        )
-        cls.base_url += "/v1"
-    def test_video_chat_completion(self):
-        pass
-class TestDeepseekVL2TinyServer(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "deepseek-ai/deepseek-vl2-tiny"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--context-length",
-                "4096",
-            ],
-        )
-        cls.base_url += "/v1"
-    def test_video_chat_completion(self):
-        pass
-class TestJanusProServer(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "deepseek-ai/Janus-Pro-7B"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--mem-fraction-static",
-                "0.4",
-            ],
-        )
-        cls.base_url += "/v1"
-    def test_video_chat_completion(self):
-        pass
-    def test_single_image_chat_completion(self):
-        # Skip this test because it is flaky
-        pass
-## Skip for ci test
-# class TestLlama4Server(TestOpenAIVisionServer):
-#     @classmethod
-#     def setUpClass(cls):
-#         cls.model = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-#         cls.base_url = DEFAULT_URL_FOR_TEST
-#         cls.api_key = "sk-123456"
-#         cls.process = popen_launch_server(
-#             cls.model,
-#             cls.base_url,
-#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-#             other_args=[
-#                 "--chat-template",
-#                 "llama-4",
-#                 "--mem-fraction-static",
-#                 "0.8",
-#                 "--tp-size=8",
-#                 "--context-length=8192",
-#             ],
-#         )
-#         cls.base_url += "/v1"
-#     def test_video_chat_completion(self):
-#         pass
-class TestGemma3itServer(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "google/gemma-3-4b-it"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--mem-fraction-static",
-                "0.75",
-                "--enable-multimodal",
-            ],
-        )
-        cls.base_url += "/v1"
-    def test_video_chat_completion(self):
-        pass
-class TestKimiVLServer(TestOpenAIVisionServer):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "moonshotai/Kimi-VL-A3B-Instruct"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.api_key = "sk-123456"
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--trust-remote-code",
-                "--context-length",
-                "4096",
-                "--dtype",
-                "bfloat16",
-            ],
-        )
-        cls.base_url += "/v1"
-    def test_video_chat_completion(self):
-        pass
-if __name__ == "__main__":
-    unittest.main()