bring back kimi vl ci (#8537)

c0fd77e8 · Stefan He · GitHub · a4c3b121 · c0fd77e8 · c0fd77e8
Unverified Commit c0fd77e8 authored Jul 29, 2025 by Stefan He Committed by GitHub Jul 29, 2025
Show whitespace changes
Inline Side-by-side

Showing with 43 additions and 45 deletions

test/srt/test_vision_openai_server_b.py test/srt/test_vision_openai_server_b.py +21 -22

test/srt/test_vlm_input_format.py test/srt/test_vlm_input_format.py +22 -23

No files found.
--- a/test/srt/test_vision_openai_server_b.py
+++ b/test/srt/test_vision_openai_server_b.py
@@ -172,29 +172,28 @@ class TestGemma3nServer(TestOpenAIVisionServer):
        cls.base_url += "/v1"


-# commented out before https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/27 get fixed
-# class TestKimiVLServer(TestOpenAIVisionServer):
-#     @classmethod
-#     def setUpClass(cls):
-#         cls.model = "moonshotai/Kimi-VL-A3B-Instruct"
-#         cls.base_url = DEFAULT_URL_FOR_TEST
-#         cls.api_key = "sk-123456"
-#         cls.process = popen_launch_server(
-#             cls.model,
-#             cls.base_url,
-#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-#             other_args=[
-#                 "--trust-remote-code",
-#                 "--context-length",
-#                 "4096",
-#                 "--dtype",
-#                 "bfloat16",
-#             ],
-#         )
-#         cls.base_url += "/v1"
+class TestKimiVLServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "moonshotai/Kimi-VL-A3B-Instruct"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--context-length",
+                "4096",
+                "--dtype",
+                "bfloat16",
+            ],
+        )
+        cls.base_url += "/v1"

-#     def test_video_images_chat_completion(self):
-#         pass
+    def test_video_images_chat_completion(self):
+        pass


 class TestPhi4MMServer(TestOpenAIVisionServer):

--- a/test/srt/test_vlm_input_format.py
+++ b/test/srt/test_vlm_input_format.py
@@ -189,32 +189,31 @@ class TestGemmaUnderstandsImage(VLMInputTestBase, unittest.IsolatedAsyncioTestCa
        )


-# commented out before https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/27 get fixed
-# class TestKimiVLImageUnderstandsImage(
-#     VLMInputTestBase, unittest.IsolatedAsyncioTestCase
-# ):
-#     model_path = "moonshotai/Kimi-VL-A3B-Instruct"
-#     chat_template = "kimi-vl"
+class TestKimiVLImageUnderstandsImage(
+    VLMInputTestBase, unittest.IsolatedAsyncioTestCase
+):
+    model_path = "moonshotai/Kimi-VL-A3B-Instruct"
+    chat_template = "kimi-vl"

-#     @classmethod
-#     def _init_visual(cls):
-#         model = AutoModel.from_pretrained(cls.model_path, trust_remote_code=True)
-#         cls.vision_tower = model.vision_tower.eval().to(cls.device)
-#         cls.mm_projector = model.multi_modal_projector.eval().to(cls.device)
+    @classmethod
+    def _init_visual(cls):
+        model = AutoModel.from_pretrained(cls.model_path, trust_remote_code=True)
+        cls.vision_tower = model.vision_tower.eval().to(cls.device)
+        cls.mm_projector = model.multi_modal_projector.eval().to(cls.device)

-#         cls.visual = lambda tokenizer_output: cls.mm_projector(
-#             cls.vision_tower(
-#                 pixel_values=tokenizer_output["pixel_values"],
-#                 grid_hws=tokenizer_output["image_grid_hws"],
-#             )
-#         )
+        cls.visual = lambda tokenizer_output: cls.mm_projector(
+            cls.vision_tower(
+                pixel_values=tokenizer_output["pixel_values"],
+                grid_hws=tokenizer_output["image_grid_hws"],
+            )
+        )

-#     def _pixel_values_image_data(self, processor_output):
-#         return dict(
-#             modality="IMAGE",
-#             pixel_values=processor_output["pixel_values"],
-#             image_grid_hws=processor_output["image_grid_hws"],
-#         )
+    def _pixel_values_image_data(self, processor_output):
+        return dict(
+            modality="IMAGE",
+            pixel_values=processor_output["pixel_values"],
+            image_grid_hws=processor_output["image_grid_hws"],
+        )


 # not for CI: too large