Unverified commit b5e3d603, authored by Mick and committed by GitHub

vlm: support video as an input modality (#5888)

parent 4ed57807
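The user-facing change in this PR is that the OpenAI-compatible chat endpoint now accepts video directly through a `video_url` content part, rather than only pre-sampled image frames. Below is a rough sketch of how a client might exercise the new modality, mirroring the message shape and request parameters used by the updated test in this diff; the base URL, API key, and video path are placeholders, not part of this commit.

```python
import openai

# Placeholder endpoint and inputs; only the message shape is taken from the diff.
client = openai.Client(api_key="EMPTY", base_url="http://127.0.0.1:30000/v1")

messages = [
    {
        "role": "user",
        "content": [
            # New in this commit: the video is passed as a video_url part,
            # instead of being pre-sampled into base64 image frames.
            {"type": "video_url", "video_url": {"url": "videos/example.mp4"}},
            {"type": "text", "text": "Please describe the video in detail."},
        ],
    }
]

response = client.chat.completions.create(
    model="default",
    messages=messages,
    temperature=0,
    max_tokens=1024,
)
print(response.choices[0].message.content)
```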
@@ -198,7 +198,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         assert response.usage.completion_tokens > 0
         assert response.usage.total_tokens > 0
 
-    def prepare_video_messages(self, video_path):
+    def prepare_video_images_messages(self, video_path):
         # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
         # the size of the video embeds differs from the `modality` argument when preprocessed
@@ -208,7 +208,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         # from transformers import AutoTokenizer
         from decord import VideoReader, cpu
 
-        max_frames_num = 20
+        max_frames_num = 10
         vr = VideoReader(video_path, ctx=cpu(0))
         total_frame_num = len(vr)
         uniform_sampled_frames = np.linspace(
@@ -229,7 +229,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         frame_format = {
             "type": "image_url",
             "image_url": {"url": "data:image/jpeg;base64,{}"},
-            "modalities": "video",
+            "modalities": "image",
         }
 
         for base64_frame in base64_frames:
@@ -243,15 +243,14 @@ class TestOpenAIVisionServer(CustomTestCase):
         return messages
 
-    def prepare_video_messages_video_direct(self, video_path):
+    def prepare_video_messages(self, video_path):
         messages = [
             {
                 "role": "user",
                 "content": [
                     {
-                        "type": "image_url",
-                        "image_url": {"url": f"video:{video_path}"},
-                        "modalities": "video",
+                        "type": "video_url",
+                        "video_url": {"url": f"{video_path}"},
                     },
                     {"type": "text", "text": "Please describe the video in detail."},
                 ],
@@ -275,13 +274,57 @@ class TestOpenAIVisionServer(CustomTestCase):
             f.write(response.content)
         return file_path
 
-    def test_video_chat_completion(self):
+    # this test samples frames of video as input, but not video directly
+    def test_video_images_chat_completion(self):
+        url = VIDEO_JOBS_URL
+        file_path = self.get_or_download_file(url)
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        messages = self.prepare_video_images_messages(file_path)
+        response = client.chat.completions.create(
+            model="default",
+            messages=messages,
+            temperature=0,
+            max_tokens=1024,
+            stream=False,
+        )
+        video_response = response.choices[0].message.content
+
+        print("-" * 30)
+        print(f"Video images response:\n{video_response}")
+        print("-" * 30)
+
+        # Add assertions to validate the video response
+        assert (
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
+        ), video_response
+        assert (
+            "man" in video_response
+            or "person" in video_response
+            or "individual" in video_response
+            or "speaker" in video_response
+        ), video_response
+        assert (
+            "present" in video_response
+            or "examine" in video_response
+            or "display" in video_response
+            or "hold" in video_response
+        )
+        assert "black" in video_response or "dark" in video_response
+        self.assertIsNotNone(video_response)
+        self.assertGreater(len(video_response), 0)
+
+    def _test_video_chat_completion(self):
         url = VIDEO_JOBS_URL
         file_path = self.get_or_download_file(url)
         client = openai.Client(api_key=self.api_key, base_url=self.base_url)
-        # messages = self.prepare_video_messages_video_direct(file_path)
         messages = self.prepare_video_messages(file_path)
         response = client.chat.completions.create(
@@ -301,7 +344,9 @@ class TestOpenAIVisionServer(CustomTestCase):
 
         # Add assertions to validate the video response
         assert (
-            "iPod" in video_response or "device" in video_response
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
        ), f"video_response: {video_response}, should contain 'iPod' or 'device'"
         assert (
             "man" in video_response
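For contrast with the direct `video_url` path, the renamed `prepare_video_images_messages` helper above keeps the older flow of uniformly sampling frames and sending them as base64 `image_url` parts tagged with `"modalities": "image"`. The following is a minimal self-contained sketch of that flow; the helper name, the use of PIL for JPEG encoding, and the return shape are illustrative assumptions rather than code from the diff.

```python
import base64
import io

import numpy as np
from decord import VideoReader, cpu
from PIL import Image


def sample_video_as_image_parts(video_path: str, max_frames_num: int = 10):
    """Uniformly sample frames and return OpenAI-style image_url content parts."""
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frame_num = len(vr)
    indices = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
    frames = vr.get_batch(indices).asnumpy()  # shape: (N, H, W, 3), uint8

    parts = []
    for frame in frames:
        buf = io.BytesIO()
        Image.fromarray(frame).save(buf, format="JPEG")
        b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
        parts.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                # per-part modality tag, now "image" rather than "video"
                "modalities": "image",
            }
        )
    return parts
```

In the test, the resulting parts are assembled into a single user message together with a text prompt and sent via `chat.completions.create`.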
@@ -10,15 +10,8 @@ import requests
 import torch
 import torch.nn.functional as F
 from PIL import Image
-from transformers import (
-    AutoModel,
-    AutoProcessor,
-    AutoTokenizer,
-    Gemma3ForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
-)
+from transformers import AutoModel, AutoProcessor, AutoTokenizer
 
-from sglang import Engine
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.conversation import generate_chat_conv
 from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
@@ -169,107 +162,107 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
 
 # TODO: MiniCPMV is not compatible with transformers==4.52.3, temporarily disabled
-# class TestMiniCPMVLogits(VisionLLMLogitsBase):
-#     @classmethod
-#     def setUpClass(cls):
-#         super().setUpClass()
-#         cls.model_path = "openbmb/MiniCPM-V-2_6"
-#         cls.tokenizer = AutoTokenizer.from_pretrained(
-#             cls.model_path, trust_remote_code=True
-#         )
-#         cls.processor = AutoProcessor.from_pretrained(
-#             cls.model_path, trust_remote_code=True
-#         )
-#         cls.chat_template = "minicpmv"
-#
-#         cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-#         cls.hf_model = (
-#             AutoModel.from_pretrained(
-#                 cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
-#             )
-#             .eval()
-#             .to(cls.device)
-#         )
-#         init_embedding_cache(0)
-#
-#     async def test_vlm_embedding_output(self):
-#         """
-#         Compares the embedding output of vlm
-#         """
-#         inputs = self.get_processor_output()
-#
-#         with torch.no_grad():
-#             # hf
-#             model_inputs = {
-#                 "input_ids": inputs.input_ids,
-#                 "image_bound": inputs.image_bound,
-#                 "pixel_values": inputs.pixel_values,
-#                 "tgt_sizes": inputs.tgt_sizes,
-#             }
-#             (hf_output, _) = self.hf_model.get_vllm_embedding(
-#                 model_inputs,
-#             )
-#             hf_output = hf_output.squeeze(0)
-#
-#             # sglang
-#             model = self.get_sglang_model()
-#             input_ids = inputs["input_ids"].to(self.device).flatten()
-#
-#             pixel_values = inputs["pixel_values"]
-#             tgt_sizes = inputs["tgt_sizes"]
-#             pixel_values_flat: List[torch.Tensor] = []
-#             tgt_sizes_flat: List[torch.Tensor] = []
-#             for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
-#                 # per image
-#                 if len(pixel_b) != len(tgt_b):
-#                     raise ValueError(
-#                         "Inconsistent N lengths, found: "
-#                         f"{len(pixel_b)} vs {len(tgt_b)}"
-#                     )
-#                 for pixel_n, tgt_n in zip(pixel_b, tgt_b):
-#                     pixel_values_flat += [pixel_n]
-#                     tgt_sizes_flat += [tgt_n]
-#
-#             im_start_id, im_end_id = (
-#                 self.tokenizer.im_start_id,
-#                 self.tokenizer.im_end_id,
-#             )
-#             slice_start_id, slice_end_id = (
-#                 self.tokenizer.slice_start_id,
-#                 self.tokenizer.slice_end_id,
-#             )
-#
-#             image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
-#                 input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
-#             )
-#             slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
-#                 input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
-#             )
-#             image_offsets.extend(slice_offsets)
-#             image_offsets = sorted(image_offsets)
-#
-#             sglang_output = embed_mm_inputs(
-#                 mm_inputs_list=[
-#                     MultimodalInputs(
-#                         mm_items=[
-#                             MultimodalDataItem(
-#                                 pixel_values=pixel_values_flat,
-#                                 image_offsets=image_offsets,
-#                                 tgt_size=tgt_sizes_flat,
-#                                 modality=Modality.IMAGE,
-#                                 pad_value=self.processor.tokenizer.unk_token_id,
-#                             )
-#                         ]
-#                     ),
-#                 ],
-#                 extend_prefix_lens=[0],
-#                 extend_seq_lens=[input_ids.shape[0]],
-#                 input_ids=input_ids,
-#                 input_embedding=model.get_input_embeddings(),
-#                 image_data_embedding_func=model.get_image_feature,
-#                 placeholder_tokens={
-#                     Modality.IMAGE: self.processor.tokenizer.unk_token_id,
-#                 },
-#             )
-#
-#             self.compare_outputs(sglang_output, hf_output)
+class TestMiniCPMVLogits(VisionLLMLogitsBase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.model_path = "openbmb/MiniCPM-V-2_6"
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.processor = AutoProcessor.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.chat_template = "minicpmv"
+
+        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        cls.hf_model = (
+            AutoModel.from_pretrained(
+                cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
+            )
+            .eval()
+            .to(cls.device)
+        )
+        init_embedding_cache()
+
+    async def test_vlm_embedding_output(self):
+        """
+        Compares the embedding output of vlm
+        """
+        inputs = self.get_processor_output()
+
+        with torch.no_grad():
+            # hf
+            model_inputs = {
+                "input_ids": inputs.input_ids,
+                "image_bound": inputs.image_bound,
+                "pixel_values": inputs.pixel_values,
+                "tgt_sizes": inputs.tgt_sizes,
+            }
+            (hf_output, _) = self.hf_model.get_vllm_embedding(
+                model_inputs,
+            )
+            hf_output = hf_output.squeeze(0)
+
+            # sglang
+            model = self.get_sglang_model()
+            input_ids = inputs["input_ids"].to(self.device).flatten()
+
+            pixel_values = inputs["pixel_values"]
+            tgt_sizes = inputs["tgt_sizes"]
+            pixel_values_flat: List[torch.Tensor] = []
+            tgt_sizes_flat: List[torch.Tensor] = []
+            for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
+                # per image
+                if len(pixel_b) != len(tgt_b):
+                    raise ValueError(
+                        "Inconsistent N lengths, found: "
+                        f"{len(pixel_b)} vs {len(tgt_b)}"
+                    )
+                for pixel_n, tgt_n in zip(pixel_b, tgt_b):
+                    pixel_values_flat += [pixel_n]
+                    tgt_sizes_flat += [tgt_n]
+
+            im_start_id, im_end_id = (
+                self.tokenizer.im_start_id,
+                self.tokenizer.im_end_id,
+            )
+            slice_start_id, slice_end_id = (
+                self.tokenizer.slice_start_id,
+                self.tokenizer.slice_end_id,
+            )
+
+            image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
+                input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
+            )
+            slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
+                input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
+            )
+            image_offsets.extend(slice_offsets)
+            image_offsets = sorted(image_offsets)
+
+            sglang_output = embed_mm_inputs(
+                mm_inputs_list=[
+                    MultimodalInputs(
+                        mm_items=[
+                            MultimodalDataItem(
+                                pixel_values=pixel_values_flat,
+                                offsets=image_offsets,
+                                tgt_size=tgt_sizes_flat,
+                                modality=Modality.IMAGE,
+                                pad_value=self.processor.tokenizer.unk_token_id,
+                            )
+                        ]
+                    ),
+                ],
+                extend_prefix_lens=[0],
+                extend_seq_lens=[input_ids.shape[0]],
+                input_ids=input_ids,
+                input_embedding=model.get_input_embeddings(),
+                multimodal_model=model,
+                placeholder_tokens={
+                    Modality.IMAGE: self.processor.tokenizer.unk_token_id,
+                },
+            )
+
+            self.compare_outputs(sglang_output, hf_output)
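The second file in the diff re-enables the MiniCPMV logits test, which checks that sglang's `embed_mm_inputs` reproduces the multimodal embeddings computed by the HuggingFace reference model. The `compare_outputs` helper it ends with is not shown in this diff; the sketch below is a generic way such an embedding parity check is often written, with the cosine-similarity criterion and the tolerances being assumptions.

```python
import torch
import torch.nn.functional as F


def compare_embeddings(
    a: torch.Tensor, b: torch.Tensor, min_cos: float = 0.99, max_abs: float = 1e-2
) -> None:
    """Generic parity check between two (seq_len, hidden_dim) embedding tensors."""
    assert a.shape == b.shape, f"shape mismatch: {a.shape} vs {b.shape}"
    a32, b32 = a.float(), b.float()
    cos = F.cosine_similarity(a32, b32, dim=-1)  # per-token cosine similarity
    assert cos.min().item() >= min_cos, f"min cosine similarity {cos.min().item():.4f}"
    diff = (a32 - b32).abs().max().item()
    assert diff <= max_abs, f"max absolute difference {diff:.4f} exceeds {max_abs}"
```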