Unverified commit b5e3d603, authored by Mick and committed by GitHub

vlm: support video as an input modality (#5888)

parent 4ed57807
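The user-facing change in this PR is that the OpenAI-compatible chat endpoint now accepts video directly through a `video_url` content part, rather than only pre-sampled image frames. Below is a rough sketch of how a client might exercise the new modality, mirroring the message shape and request parameters used by the updated test in this diff; the base URL, API key, and video path are placeholders, not part of this commit.

```python
import openai

# Placeholder endpoint and inputs; only the message shape is taken from the diff.
client = openai.Client(api_key="EMPTY", base_url="http://127.0.0.1:30000/v1")

messages = [
    {
        "role": "user",
        "content": [
            # New in this commit: the video is passed as a video_url part,
            # instead of being pre-sampled into base64 image frames.
            {"type": "video_url", "video_url": {"url": "videos/example.mp4"}},
            {"type": "text", "text": "Please describe the video in detail."},
        ],
    }
]

response = client.chat.completions.create(
    model="default",
    messages=messages,
    temperature=0,
    max_tokens=1024,
)
print(response.choices[0].message.content)
```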
@@ -198,7 +198,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         assert response.usage.completion_tokens > 0
         assert response.usage.total_tokens > 0
 
-    def prepare_video_messages(self, video_path):
+    def prepare_video_images_messages(self, video_path):
         # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
         # the size of the video embeds differs from the `modality` argument when preprocessed
@@ -208,7 +208,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         # from transformers import AutoTokenizer
         from decord import VideoReader, cpu
 
-        max_frames_num = 20
+        max_frames_num = 10
         vr = VideoReader(video_path, ctx=cpu(0))
         total_frame_num = len(vr)
         uniform_sampled_frames = np.linspace(
@@ -229,7 +229,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         frame_format = {
             "type": "image_url",
             "image_url": {"url": "data:image/jpeg;base64,{}"},
-            "modalities": "video",
+            "modalities": "image",
         }
 
         for base64_frame in base64_frames:
@@ -243,15 +243,14 @@ class TestOpenAIVisionServer(CustomTestCase):
         return messages
 
-    def prepare_video_messages_video_direct(self, video_path):
+    def prepare_video_messages(self, video_path):
         messages = [
             {
                 "role": "user",
                 "content": [
                     {
-                        "type": "image_url",
-                        "image_url": {"url": f"video:{video_path}"},
-                        "modalities": "video",
+                        "type": "video_url",
+                        "video_url": {"url": f"{video_path}"},
                     },
                     {"type": "text", "text": "Please describe the video in detail."},
                 ],
@@ -275,13 +274,57 @@ class TestOpenAIVisionServer(CustomTestCase):
             f.write(response.content)
         return file_path
 
-    def test_video_chat_completion(self):
+    # this test samples frames of video as input, but not video directly
+    def test_video_images_chat_completion(self):
+        url = VIDEO_JOBS_URL
+        file_path = self.get_or_download_file(url)
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+        messages = self.prepare_video_images_messages(file_path)
+        response = client.chat.completions.create(
+            model="default",
+            messages=messages,
+            temperature=0,
+            max_tokens=1024,
+            stream=False,
+        )
+        video_response = response.choices[0].message.content
+
+        print("-" * 30)
+        print(f"Video images response:\n{video_response}")
+        print("-" * 30)
+
+        # Add assertions to validate the video response
+        assert (
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
+        ), video_response
+        assert (
+            "man" in video_response
+            or "person" in video_response
+            or "individual" in video_response
+            or "speaker" in video_response
+        ), video_response
+        assert (
+            "present" in video_response
+            or "examine" in video_response
+            or "display" in video_response
+            or "hold" in video_response
+        )
+        assert "black" in video_response or "dark" in video_response
+        self.assertIsNotNone(video_response)
+        self.assertGreater(len(video_response), 0)
+
+    def _test_video_chat_completion(self):
         url = VIDEO_JOBS_URL
         file_path = self.get_or_download_file(url)
         client = openai.Client(api_key=self.api_key, base_url=self.base_url)
-        # messages = self.prepare_video_messages_video_direct(file_path)
         messages = self.prepare_video_messages(file_path)
         response = client.chat.completions.create(
@@ -301,7 +344,9 @@ class TestOpenAIVisionServer(CustomTestCase):
 
         # Add assertions to validate the video response
         assert (
-            "iPod" in video_response or "device" in video_response
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
        ), f"video_response: {video_response}, should contain 'iPod' or 'device'"
         assert (
             "man" in video_response
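For contrast with the direct `video_url` path, the renamed `prepare_video_images_messages` helper above keeps the older flow of uniformly sampling frames and sending them as base64 `image_url` parts tagged with `"modalities": "image"`. The following is a minimal self-contained sketch of that flow; the helper name, the use of PIL for JPEG encoding, and the return shape are illustrative assumptions rather than code from the diff.

```python
import base64
import io

import numpy as np
from decord import VideoReader, cpu
from PIL import Image


def sample_video_as_image_parts(video_path: str, max_frames_num: int = 10):
    """Uniformly sample frames and return OpenAI-style image_url content parts."""
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frame_num = len(vr)
    indices = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
    frames = vr.get_batch(indices).asnumpy()  # shape: (N, H, W, 3), uint8

    parts = []
    for frame in frames:
        buf = io.BytesIO()
        Image.fromarray(frame).save(buf, format="JPEG")
        b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
        parts.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                # per-part modality tag, now "image" rather than "video"
                "modalities": "image",
            }
        )
    return parts
```

In the test, the resulting parts are assembled into a single user message together with a text prompt and sent via `chat.completions.create`.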
@@ -10,15 +10,8 @@ import requests
 import torch
 import torch.nn.functional as F
 from PIL import Image
-from transformers import (
-    AutoModel,
-    AutoProcessor,
-    AutoTokenizer,
-    Gemma3ForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
-)
+from transformers import AutoModel, AutoProcessor, AutoTokenizer
 
-from sglang import Engine
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.conversation import generate_chat_conv
 from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
@@ -169,107 +162,107 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
 
 # TODO: MiniCPMV is not compatible with transformers==4.52.3, temporarily disabled
-# class TestMiniCPMVLogits(VisionLLMLogitsBase):
-#     @classmethod
-#     def setUpClass(cls):
-#         super().setUpClass()
-#         cls.model_path = "openbmb/MiniCPM-V-2_6"
-#         cls.tokenizer = AutoTokenizer.from_pretrained(
-#             cls.model_path, trust_remote_code=True
-#         )
-#         cls.processor = AutoProcessor.from_pretrained(
-#             cls.model_path, trust_remote_code=True
-#         )
-#         cls.chat_template = "minicpmv"
-#
-#         cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-#         cls.hf_model = (
-#             AutoModel.from_pretrained(
-#                 cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
-#             )
-#             .eval()
-#             .to(cls.device)
-#         )
-#         init_embedding_cache(0)
-#
-#     async def test_vlm_embedding_output(self):
-#         """
-#         Compares the embedding output of vlm
-#         """
-#         inputs = self.get_processor_output()
-#
-#         with torch.no_grad():
-#             # hf
-#             model_inputs = {
-#                 "input_ids": inputs.input_ids,
-#                 "image_bound": inputs.image_bound,
-#                 "pixel_values": inputs.pixel_values,
-#                 "tgt_sizes": inputs.tgt_sizes,
-#             }
-#             (hf_output, _) = self.hf_model.get_vllm_embedding(
-#                 model_inputs,
-#             )
-#             hf_output = hf_output.squeeze(0)
-#
-#             # sglang
-#             model = self.get_sglang_model()
-#             input_ids = inputs["input_ids"].to(self.device).flatten()
-#
-#             pixel_values = inputs["pixel_values"]
-#             tgt_sizes = inputs["tgt_sizes"]
-#             pixel_values_flat: List[torch.Tensor] = []
-#             tgt_sizes_flat: List[torch.Tensor] = []
-#             for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
-#                 # per image
-#                 if len(pixel_b) != len(tgt_b):
-#                     raise ValueError(
-#                         "Inconsistent N lengths, found: "
-#                         f"{len(pixel_b)} vs {len(tgt_b)}"
-#                     )
-#                 for pixel_n, tgt_n in zip(pixel_b, tgt_b):
-#                     pixel_values_flat += [pixel_n]
-#                     tgt_sizes_flat += [tgt_n]
-#
-#             im_start_id, im_end_id = (
-#                 self.tokenizer.im_start_id,
-#                 self.tokenizer.im_end_id,
-#             )
-#             slice_start_id, slice_end_id = (
-#                 self.tokenizer.slice_start_id,
-#                 self.tokenizer.slice_end_id,
-#             )
-#
-#             image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
-#                 input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
-#             )
-#             slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
-#                 input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
-#             )
-#             image_offsets.extend(slice_offsets)
-#             image_offsets = sorted(image_offsets)
-#
-#             sglang_output = embed_mm_inputs(
-#                 mm_inputs_list=[
-#                     MultimodalInputs(
-#                         mm_items=[
-#                             MultimodalDataItem(
-#                                 pixel_values=pixel_values_flat,
-#                                 image_offsets=image_offsets,
-#                                 tgt_size=tgt_sizes_flat,
-#                                 modality=Modality.IMAGE,
-#                                 pad_value=self.processor.tokenizer.unk_token_id,
-#                             )
-#                         ]
-#                     ),
-#                 ],
-#                 extend_prefix_lens=[0],
-#                 extend_seq_lens=[input_ids.shape[0]],
-#                 input_ids=input_ids,
-#                 input_embedding=model.get_input_embeddings(),
-#                 image_data_embedding_func=model.get_image_feature,
-#                 placeholder_tokens={
-#                     Modality.IMAGE: self.processor.tokenizer.unk_token_id,
-#                 },
-#             )
-#
-#             self.compare_outputs(sglang_output, hf_output)
+class TestMiniCPMVLogits(VisionLLMLogitsBase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.model_path = "openbmb/MiniCPM-V-2_6"
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.processor = AutoProcessor.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.chat_template = "minicpmv"
+
+        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        cls.hf_model = (
+            AutoModel.from_pretrained(
+                cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
+            )
+            .eval()
+            .to(cls.device)
+        )
+        init_embedding_cache()
+
+    async def test_vlm_embedding_output(self):
+        """
+        Compares the embedding output of vlm
+        """
+        inputs = self.get_processor_output()
+
+        with torch.no_grad():
+            # hf
+            model_inputs = {
+                "input_ids": inputs.input_ids,
+                "image_bound": inputs.image_bound,
+                "pixel_values": inputs.pixel_values,
+                "tgt_sizes": inputs.tgt_sizes,
+            }
+            (hf_output, _) = self.hf_model.get_vllm_embedding(
+                model_inputs,
+            )
+            hf_output = hf_output.squeeze(0)
+
+            # sglang
+            model = self.get_sglang_model()
+            input_ids = inputs["input_ids"].to(self.device).flatten()
+
+            pixel_values = inputs["pixel_values"]
+            tgt_sizes = inputs["tgt_sizes"]
+            pixel_values_flat: List[torch.Tensor] = []
+            tgt_sizes_flat: List[torch.Tensor] = []
+            for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
+                # per image
+                if len(pixel_b) != len(tgt_b):
+                    raise ValueError(
+                        "Inconsistent N lengths, found: "
+                        f"{len(pixel_b)} vs {len(tgt_b)}"
+                    )
+                for pixel_n, tgt_n in zip(pixel_b, tgt_b):
+                    pixel_values_flat += [pixel_n]
+                    tgt_sizes_flat += [tgt_n]
+
+            im_start_id, im_end_id = (
+                self.tokenizer.im_start_id,
+                self.tokenizer.im_end_id,
+            )
+            slice_start_id, slice_end_id = (
+                self.tokenizer.slice_start_id,
+                self.tokenizer.slice_end_id,
+            )
+
+            image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
+                input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
+            )
+            slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
+                input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
+            )
+            image_offsets.extend(slice_offsets)
+            image_offsets = sorted(image_offsets)
+
+            sglang_output = embed_mm_inputs(
+                mm_inputs_list=[
+                    MultimodalInputs(
+                        mm_items=[
+                            MultimodalDataItem(
+                                pixel_values=pixel_values_flat,
+                                offsets=image_offsets,
+                                tgt_size=tgt_sizes_flat,
+                                modality=Modality.IMAGE,
+                                pad_value=self.processor.tokenizer.unk_token_id,
+                            )
+                        ]
+                    ),
+                ],
+                extend_prefix_lens=[0],
+                extend_seq_lens=[input_ids.shape[0]],
+                input_ids=input_ids,
+                input_embedding=model.get_input_embeddings(),
+                multimodal_model=model,
+                placeholder_tokens={
+                    Modality.IMAGE: self.processor.tokenizer.unk_token_id,
+                },
+            )
+
+            self.compare_outputs(sglang_output, hf_output)
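The second file in the diff re-enables the MiniCPMV logits test, which checks that sglang's `embed_mm_inputs` reproduces the multimodal embeddings computed by the HuggingFace reference model. The `compare_outputs` helper it ends with is not shown in this diff; the sketch below is a generic way such an embedding parity check is often written, with the cosine-similarity criterion and the tolerances being assumptions.

```python
import torch
import torch.nn.functional as F


def compare_embeddings(
    a: torch.Tensor, b: torch.Tensor, min_cos: float = 0.99, max_abs: float = 1e-2
) -> None:
    """Generic parity check between two (seq_len, hidden_dim) embedding tensors."""
    assert a.shape == b.shape, f"shape mismatch: {a.shape} vs {b.shape}"
    a32, b32 = a.float(), b.float()
    cos = F.cosine_similarity(a32, b32, dim=-1)  # per-token cosine similarity
    assert cos.min().item() >= min_cos, f"min cosine similarity {cos.min().item():.4f}"
    diff = (a32 - b32).abs().max().item()
    assert diff <= max_abs, f"max absolute difference {diff:.4f} exceeds {max_abs}"
```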