Unverified commit bcc213df authored by Mick, committed by GitHub

Model: Support Qwen 2.5 VL (#3258)

parent 39416e39
@@ -4,7 +4,7 @@
- Llama / Llama 2 / Llama 3 / Llama 3.1 / Llama 3.2
- Mistral / Mixtral / Mistral NeMo / Mistral Small 3
- Gemma / Gemma 2
- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL / Qwen 2.5 VL
- DeepSeek / DeepSeek 2 / [DeepSeek 3](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3)
- OLMoE
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -54,7 +54,7 @@ To support a new model in SGLang, you only need to add a single file under [SGLa
You can learn from existing model implementations and create new files for the new models.
For most models, you should be able to find a similar model to start with (e.g., starting from Llama).

## How to Support a New vLM

To support a new vision-language model (vLM) in SGLang, there are several key components in addition to the standard LLM.
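For orientation, the rough shape of such a model file is sketched below. This is an illustration only, not the exact interface: `NewModelForCausalLM` and the method bodies are placeholders, and the real constructor and `forward` signatures should be copied from a neighboring model file under `sglang/srt/models`.

```python
# Illustrative skeleton only: copy the real signatures from an existing
# model file (e.g. llama.py or qwen2_vl.py) rather than from this sketch.
import torch
from torch import nn


class NewModelForCausalLM(nn.Module):
    """Placeholder model class; the name and internals are hypothetical."""

    def __init__(self, config, quant_config=None):
        super().__init__()
        self.config = config
        # Build embeddings, decoder layers, and the LM head here.

    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, forward_batch):
        # Run the decoder and return logits for the scheduler.
        raise NotImplementedError

    def load_weights(self, weights):
        # Map checkpoint tensors onto the modules created in __init__.
        raise NotImplementedError


# Existing SGLang model files export the implementation through a module-level
# EntryClass assignment, which the model loader picks up.
EntryClass = NewModelForCausalLM
```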
......
@@ -427,6 +427,8 @@ def match_chat_ml(model_path: str):
    if "tinyllama" in model_path:
        return get_chat_template("chatml")
    # Now the suffix for qwen2 chat model is "instruct"
    if "qwen" in model_path and "vl" in model_path:
        return get_chat_template("qwen2-vl")
    if "qwen" in model_path:
        if "vl" in model_path:
            return get_chat_template("qwen2-vl")
@@ -443,6 +445,12 @@ def match_chat_ml(model_path: str):
        return get_chat_template("chatml-llava")
@register_chat_template_matching_function
def match_chat_minicpm(model_path: str):
if "minicpm" in model_path:
return get_chat_template("minicpmv")
@register_chat_template_matching_function
def match_chat_yi(model_path: str):
    model_path = model_path.lower()
......
from sglang.srt.configs.chatglm import ChatGLMConfig
from sglang.srt.configs.dbrx import DbrxConfig
from sglang.srt.configs.exaone import ExaoneConfig
-from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig
+from sglang.srt.configs.qwen2_5_vl_config import (
+    Qwen2_5_VLConfig,
+    Qwen2_5_VLVisionConfig,
+)

__all__ = [
    "ExaoneConfig",
-    "Qwen2VLConfig",
-    "Qwen2VLVisionConfig",
    "ChatGLMConfig",
    "DbrxConfig",
+    "Qwen2_5_VLConfig",
+    "Qwen2_5_VLVisionConfig",
]
@@ -403,6 +403,7 @@ def is_multimodal_model(model_architectures: List[str]):
        or "LlavaVidForCausalLM" in model_architectures
        or "MllamaForConditionalGeneration" in model_architectures
        or "Qwen2VLForConditionalGeneration" in model_architectures
        or "Qwen2_5_VLForConditionalGeneration" in model_architectures
        or "MiniCPMV" in model_architectures
    ):
        return True
......
This diff is collapsed.
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen2VL model configuration"""
import os
from typing import Union
from transformers import PretrainedConfig
class Qwen2VLVisionConfig(PretrainedConfig):
model_type = "qwen2_vl"
def __init__(
self,
depth=32,
embed_dim=1280,
hidden_size=3584,
hidden_act="quick_gelu",
mlp_ratio=4,
num_heads=16,
in_channels=3,
patch_size=14,
spatial_merge_size=2,
temporal_patch_size=2,
**kwargs,
):
super().__init__(**kwargs)
self.depth = depth
self.embed_dim = embed_dim
self.hidden_size = hidden_size
self.hidden_act = hidden_act
self.mlp_ratio = mlp_ratio
self.num_heads = num_heads
self.in_channels = in_channels
self.patch_size = patch_size
self.spatial_merge_size = spatial_merge_size
self.temporal_patch_size = temporal_patch_size
@classmethod
def from_pretrained(
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(
pretrained_model_name_or_path, **kwargs
)
if config_dict.get("model_type") == "qwen2_vl":
config_dict = config_dict["vision_config"]
return cls.from_dict(config_dict, **kwargs)
class Qwen2VLConfig(PretrainedConfig):
model_type = "qwen2_vl"
def __init__(
self,
vocab_size=152064,
hidden_size=8192,
intermediate_size=29568,
num_hidden_layers=80,
num_attention_heads=64,
num_key_value_heads=8,
hidden_act="silu",
max_position_embeddings=32768,
initializer_range=0.02,
rms_norm_eps=1e-05,
use_cache=True,
tie_word_embeddings=False,
rope_theta=1000000.0,
use_sliding_window=False,
sliding_window=4096,
max_window_layers=80,
attention_dropout=0.0,
vision_config=None,
rope_scaling=None,
**kwargs,
):
if isinstance(vision_config, dict):
self.vision_config = Qwen2VLVisionConfig(**vision_config)
elif vision_config is None:
self.vision_config = Qwen2VLVisionConfig()
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.use_sliding_window = use_sliding_window
self.sliding_window = sliding_window
self.max_window_layers = max_window_layers
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.attention_dropout = attention_dropout
self.rope_scaling = rope_scaling
# NOTE(HandH1998): This is necessary for configuring the `rope_type` of qwen2vl models after removing dependencies on vllm.
if self.rope_scaling is not None and "type" in self.rope_scaling:
if self.rope_scaling["type"] == "mrope":
self.rope_scaling["type"] = "default"
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
@@ -30,16 +30,15 @@ from transformers import (
)
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

-from sglang.srt.configs import ChatGLMConfig, DbrxConfig, ExaoneConfig, Qwen2VLConfig
+from sglang.srt.configs import ChatGLMConfig, DbrxConfig, ExaoneConfig, Qwen2_5_VLConfig

_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
    ChatGLMConfig.model_type: ChatGLMConfig,
    DbrxConfig.model_type: DbrxConfig,
    ExaoneConfig.model_type: ExaoneConfig,
-    Qwen2VLConfig.model_type: Qwen2VLConfig,
+    Qwen2_5_VLConfig.model_type: Qwen2_5_VLConfig,
}

for name, cls in _CONFIG_REGISTRY.items():
    with contextlib.suppress(ValueError):
        AutoConfig.register(name, cls)
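For reference, registering a config class here is what lets `AutoConfig` resolve checkpoints whose `config.json` declares the corresponding `model_type`. A minimal standalone sketch of that mechanism, with a made-up `my_vl` type for illustration:

```python
from transformers import AutoConfig, PretrainedConfig


class MyVLConfig(PretrainedConfig):
    # AutoConfig dispatches on this string when reading config.json.
    model_type = "my_vl"

    def __init__(self, hidden_size=1024, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size


# After registration, AutoConfig can build MyVLConfig from its model_type.
AutoConfig.register("my_vl", MyVLConfig)
cfg = AutoConfig.for_model("my_vl", hidden_size=2048)
assert isinstance(cfg, MyVLConfig) and cfg.hidden_size == 2048
```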
......
# TODO: also move pad_input_ids into this module
import asyncio
import concurrent.futures
import dataclasses
import logging
import multiprocessing as mp
import os
@@ -8,6 +9,7 @@ from abc import ABC, abstractmethod
from typing import List, Optional, Union

import numpy as np
import PIL
import transformers
from decord import VideoReader, cpu
from PIL import Image
@@ -34,11 +36,22 @@ def init_global_processor(server_args: ServerArgs):
    )


@dataclasses.dataclass
class BaseImageProcessorOutput:
    image_hashes: list[int]
    image_sizes: list[int]
    all_frames: List[PIL.Image.Image]
    # input_text, with each frame of video/image represented with an image_token
    input_text: str
class BaseImageProcessor(ABC):
    def __init__(self, hf_config, server_args, _processor):
        self.hf_config = hf_config
        self._processor = _processor
        self.server_args = server_args
# FIXME: not accurate, model and image specific
self.NUM_TOKEN_PER_FRAME = 330
        self.executor = concurrent.futures.ProcessPoolExecutor(
            initializer=init_global_processor,
@@ -48,9 +61,128 @@ class BaseImageProcessor(ABC):
        )

    @abstractmethod
-    async def process_images_async(self, image_data, input_text, **kwargs):
+    async def process_images_async(
+        self, image_data, input_text, max_req_input_len, **kwargs
+    ):
        pass
def get_estimated_frames_list(self, image_data):
"""
estimate the total frame count from all visual input
"""
# Before processing inputs
estimated_frames_list = []
for image in image_data:
if isinstance(image, str) and image.startswith("video:"):
path = image[len("video:") :]
# Estimate frames for the video
vr = VideoReader(path, ctx=cpu(0))
num_frames = len(vr)
else:
# For images, each contributes one frame
num_frames = 1
estimated_frames_list.append(num_frames)
return estimated_frames_list
def encode_video(self, video_path, frame_count_limit=None):
if not os.path.exists(video_path):
logger.error(f"Video {video_path} does not exist")
return []
if frame_count_limit == 0:
return []
def uniform_sample(l, n):
gap = len(l) / n
idxs = [int(i * gap + gap / 2) for i in range(n)]
return [l[i] for i in idxs]
vr = VideoReader(video_path, ctx=cpu(0))
sample_fps = round(vr.get_avg_fps() / 1) # FPS
frame_idx = [i for i in range(0, len(vr), sample_fps)]
if frame_count_limit is not None and len(frame_idx) > frame_count_limit:
frame_idx = uniform_sample(frame_idx, frame_count_limit)
frames = vr.get_batch(frame_idx).asnumpy()
frames = [Image.fromarray(v.astype("uint8")) for v in frames]
return frames
def load_images(
self,
max_req_input_len: int,
input_ids: list,
image_data,
image_token: str,
) -> BaseImageProcessorOutput:
"""
Each frame of video/image will be replaced by a single image token
"""
image_hashes, image_sizes = [], []
all_frames = []
new_text_parts = []
if isinstance(input_ids, list):
assert len(input_ids) and isinstance(input_ids[0], int)
input_text = self._processor.tokenizer.decode(input_ids)
else:
input_text = input_ids
text_parts = input_text.split(image_token)
# roughly calculate the max number of frames under the max_req_input_len limit
def calculate_max_num_frames() -> int:
ret = (max_req_input_len - len(input_ids)) // self.NUM_TOKEN_PER_FRAME
return min(ret, 100)
MAX_NUM_FRAMES = calculate_max_num_frames()
estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
total_frame_count = sum(estimated_frames_list)
# a heuristic value, suggesting the maximum fraction of frames to embed from all visual inputs.
# e.g., 0.1 suggests that 1 frame out of 10 input frames should be used
scaling_factor = min(1.0, MAX_NUM_FRAMES / total_frame_count)
# Process each input with allocated frames
for image_index, (image, estimated_frames) in enumerate(
zip(image_data, estimated_frames_list)
):
if len(all_frames) >= MAX_NUM_FRAMES:
frames_to_process = 0
else:
frames_to_process = max(1, int(estimated_frames * scaling_factor))
if frames_to_process == 0:
frames = []
else:
try:
if isinstance(image, str) and image.startswith("video:"):
path = image[len("video:") :]
frames = self.encode_video(
path, frame_count_limit=frames_to_process
)
else:
raw_image, _size = load_image(image)
frames = [raw_image]
if len(frames) == 0:
continue
except FileNotFoundError as e:
print(e)
return None
image_sizes += frames[0].size * len(frames)
image_hashes += [hash(image)] * len(frames)
all_frames += frames
new_text_parts.append(text_parts[image_index])
if frames_to_process != 0:
new_text_parts.append(image_token * len(frames))
assert frames_to_process == len(frames)
new_text_parts.append(text_parts[-1])
input_text = "".join(new_text_parts)
return BaseImageProcessorOutput(
image_hashes, image_sizes, all_frames, input_text
)
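To make the frame-budgeting arithmetic above concrete, here is a small self-contained sketch with made-up numbers: a 4000-token request budget, a 700-token prompt, and three visual inputs contributing 30, 9, and 1 estimated frames.

```python
# Standalone sketch of the frame-budgeting heuristic used in load_images.
# All numbers below are illustrative, not taken from a real request.
NUM_TOKEN_PER_FRAME = 330           # rough per-frame token cost (model specific)
max_req_input_len = 4000            # request token budget
prompt_len = 700                    # tokens already used by the text prompt
estimated_frames_list = [30, 9, 1]  # two videos and one still image

max_num_frames = min((max_req_input_len - prompt_len) // NUM_TOKEN_PER_FRAME, 100)
total_frames = sum(estimated_frames_list)
scaling_factor = min(1.0, max_num_frames / total_frames)

# Each input keeps at least one frame, scaled down proportionally.
frames_to_process = [max(1, int(n * scaling_factor)) for n in estimated_frames_list]
print(max_num_frames, scaling_factor, frames_to_process)  # 10 0.25 [7, 2, 1]
```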
class DummyImageProcessor(BaseImageProcessor):
    def __init__(self):
@@ -248,9 +380,9 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
            text=input_text, images=images, return_tensors="pt"
        )
        return {
"input_ids": result["input_ids"], "input_ids": result.input_ids,
"pixel_values": result["pixel_values"], "pixel_values": result.pixel_values,
"tgt_sizes": result["tgt_sizes"], "tgt_sizes": result.tgt_sizes,
} }
async def _process_images(self, images, input_text): async def _process_images(self, images, input_text):
@@ -278,124 +410,20 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
    ):
        if not image_data:
            return None

        if not isinstance(image_data, list):
            image_data = [image_data]
-        image_hashes, image_sizes = [], []
-        all_frames = []
[... remaining removed lines elided: the inline calculate_max_num_frames(), get_estimated_frames_list(), encode_video(), and per-image frame-splitting logic was moved into BaseImageProcessor.load_images above ...]
+        base_output = self.load_images(
+            max_req_input_len, input_ids, image_data, self.IMAGE_TOKEN
+        )
+        if base_output is None:
+            return None

-        if len(all_frames) == 0:
+        if len(base_output.all_frames) == 0:
            return None

-        res = await self._process_images(images=all_frames, input_text=input_text)
-        pixel_values = res["pixel_values"]
-        tgt_sizes = res["tgt_sizes"]
-        input_ids = res["input_ids"]
+        res = await self._process_images(
+            images=base_output.all_frames, input_text=base_output.input_text
+        )
        # Collect special token ids
        tokenizer = self._processor.tokenizer
@@ -405,10 +433,10 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
        slice_start_id = [tokenizer.slice_start_id]
        slice_end_id = [tokenizer.slice_end_id]
        return {
-            "input_ids": input_ids.flatten().tolist(),
-            "pixel_values": pixel_values,
-            "tgt_sizes": tgt_sizes,
-            "image_hashes": image_hashes,
+            "input_ids": res["input_ids"].flatten().tolist(),
+            "pixel_values": res["pixel_values"],
+            "tgt_sizes": res["tgt_sizes"],
+            "image_hashes": base_output.image_hashes,
            "modalities": request_obj.modalities or ["image"],
            "im_start_id": im_start_id,
            "im_end_id": im_end_id,
@@ -536,13 +564,80 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
        }
class Qwen2_5VLImageProcessor(BaseImageProcessor):
def __init__(self, hf_config, server_args, _processor):
super().__init__(hf_config, server_args, _processor)
self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
self.NUM_TOKEN_PER_FRAME = 770
@staticmethod
def _process_images_task(images, input_text):
result = global_processor.__call__(
text=input_text, images=images, return_tensors="pt"
)
return {
"input_ids": result.input_ids,
"pixel_values": result.pixel_values,
"image_grid_thws": result.image_grid_thw,
}
async def _process_images(self, images, input_text) -> dict:
if self.executor is not None:
loop = asyncio.get_event_loop()
return await loop.run_in_executor(
self.executor,
Qwen2_5VLImageProcessor._process_images_task,
images,
input_text,
)
else:
return self._process_images_task(images, input_text)
async def process_images_async(
self,
image_data: List[Union[str, bytes]],
input_ids,
request_obj,
max_req_input_len,
*args,
**kwargs,
):
if not image_data:
return None
if isinstance(image_data, str):
image_data = [image_data]
image_token = self.IMAGE_TOKEN
base_output = self.load_images(
max_req_input_len, input_ids, image_data, image_token
)
ret = await self._process_images(base_output.all_frames, base_output.input_text)
return {
"input_ids": ret["input_ids"].flatten().tolist(),
"pixel_values": ret["pixel_values"],
"image_hashes": base_output.image_hashes,
"modalities": request_obj.modalities or ["image"],
"image_grid_thws": ret["image_grid_thws"],
"im_start_id": self.IM_START_TOKEN_ID,
"im_end_id": self.IM_END_TOKEN_ID,
}
def get_image_processor(
    hf_config, server_args: ServerArgs, processor
) -> BaseImageProcessor:
    if "MllamaForConditionalGeneration" in hf_config.architectures:
        return MllamaImageProcessor(hf_config, server_args, processor)
elif "Qwen2VLForConditionalGeneration" in hf_config.architectures: elif "Qwen2VLForConditionalGeneration" in hf_config.architectures:
return Qwen2VLImageProcessor(hf_config, server_args, processor.image_processor)
return Qwen2VLImageProcessor(hf_config, server_args, processor)
elif "Qwen2_5_VLForConditionalGeneration" in hf_config.architectures:
return Qwen2_5VLImageProcessor(hf_config, server_args, processor)
elif "MiniCPMV" in hf_config.architectures: elif "MiniCPMV" in hf_config.architectures:
return MiniCPMVImageProcessor(hf_config, server_args, processor) return MiniCPMVImageProcessor(hf_config, server_args, processor)
else: else:
......
This diff is collapsed.
@@ -31,8 +31,9 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
+from transformers import Qwen2VLConfig
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
-from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig

from sglang.srt.hf_transformers_utils import get_processor
from sglang.srt.layers.activation import QuickGELU
from sglang.srt.layers.attention.vision import VisionAttention
......
@@ -252,6 +252,18 @@ class TestOpenAIVisionServer(unittest.TestCase):
        print("-" * 30)

        # Add assertions to validate the video response
assert "iPod" in video_response or "device" in video_response, video_response
assert (
"man" in video_response
or "person" in video_response
or "individual" in video_response
), video_response
assert (
"present" in video_response
or "examine" in video_response
or "display" in video_response
)
assert "black" in video_response or "dark" in video_response
        self.assertIsNotNone(video_response)
        self.assertGreater(len(video_response), 0)

@@ -366,6 +378,30 @@ class TestQWen2VLServer(TestOpenAIVisionServer):
        cls.base_url += "/v1"
class TestQWen2_5_VLServer(TestOpenAIVisionServer):
@classmethod
def setUpClass(cls):
cls.model = "Qwen/Qwen2.5-VL-7B-Instruct"
cls.base_url = DEFAULT_URL_FOR_TEST
cls.api_key = "sk-123456"
cls.process = popen_launch_server(
cls.model,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
api_key=cls.api_key,
other_args=[
"--chat-template",
"qwen2-vl",
                # FIXME: workaround so that chunked prefill does not split within image embeds
"--chunked-prefill-size",
"10000",
"--mem-fraction-static",
"0.4",
],
)
cls.base_url += "/v1"
class TestQWen2VLServerContextLengthIssue(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
......