Unverified commit bcf0731a, authored by myselvess, committed by GitHub
Browse files

[New Model] support new model ovis2.6 (#34426)


Signed-off-by: myselvess <23743269+myselvess@users.noreply.github.com>
parent ec090c24
......@@ -728,6 +728,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `OpenPanguVLForConditionalGeneration` | openpangu-VL | T + I<sup>E+</sup> + V<sup>E+</sup> |`FreedomIntelligence/openPangu-VL-7B` | ✅︎ | ✅︎ |
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
| `Ovis2_6ForCausalLM` | Ovis2.6 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.6-2B`, etc. | | |
| `Ovis2_6_MoeForCausalLM` | Ovis2.6 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.6-30B-A3B`, etc. | | |
| `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ✅︎ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
......
......@@ -915,6 +915,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
},
),
"Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
"Ovis2_6ForCausalLM": _HfExamplesInfo(
"AIDC-AI/Ovis2.6-2B", is_available_online=False, trust_remote_code=True
),
"Ovis2_6_MoeForCausalLM": _HfExamplesInfo(
"AIDC-AI/Ovis2.6-30B-A3B", trust_remote_code=True
),
"PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
"PaddlePaddle/PaddleOCR-VL",
trust_remote_code=True,
......
......@@ -42,21 +42,12 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
IMAGE_TOKEN = "<image>"
IMAGE_PLACEHOLDER_ID = 151669
VIDEO_TOKEN = "<video>"
INDICATOR_IDS = [-301, -302, -303, -304]
IMAGE_PAD_TOKEN_MAP = {
"gemma2": "<unused0>",
"llama": "<|reserved_special_token_0|>",
"qwen2": "<|image_pad|>",
"qwen3": "<|image_pad|>",
}
IMAGE_PAD_TOKEN_ID_MAP = {
"gemma2": 7,
"llama": 128002,
"qwen2": 151655,
"qwen3": 151655,
}
VIDEO_PLACEHOLDER_ID = 151670
INDICATOR_IDS = [151672, 151673, 151674, 151675]
IMAGE_PAD_TOKEN_ID = 151655
THINK_END_TOKEN_ID = 151668
class Ovis2_5ImagePatchInputs(TensorSchema):
......@@ -187,17 +178,11 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
vit_config = self.get_hf_config().vit_config
return self.ctx.get_hf_processor(
Ovis2_5Processor,
image_pad_token=self.get_image_pad_token(),
patch_size=vit_config.patch_size,
hidden_stride=vit_config.hidden_stride,
temporal_patch_size=vit_config.temporal_patch_size,
)
def get_image_pad_token(self) -> str:
hf_text_config = self.get_hf_config().get_text_config()
text_model_type = hf_text_config.model_type
return IMAGE_PAD_TOKEN_MAP.get(text_model_type)
def get_image_processor(self) -> BaseImageProcessor:
return self.get_hf_processor().image_processor # type: ignore
......@@ -342,9 +327,9 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo])
hf_config = self.info.get_hf_config()
vte_vocab_size = hf_config.visual_vocab_size
return [
vte_vocab_size - len(INDICATOR_IDS) + abs(x + 300) - 1
vte_vocab_size - len(INDICATOR_IDS) + (x - INDICATOR_IDS[0])
for x in visual_indicators
if x < -300
if x >= INDICATOR_IDS[0]
]
def _call_hf_processor(
......@@ -417,6 +402,14 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo])
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargsItems,
) -> list[PromptReplacement]:
tokenizer = self.info.get_tokenizer()
vocab = tokenizer.get_vocab()
placeholder = {
"image": vocab[IMAGE_TOKEN],
"video": vocab[VIDEO_TOKEN],
}
def get_replacement_ovis(item_idx, modality: str):
if modality == "image":
out_item = out_mm_kwargs["image"][item_idx]
......@@ -432,7 +425,7 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo])
return [
PromptReplacement(
modality=modality,
target=IMAGE_TOKEN if modality == "image" else VIDEO_TOKEN,
target=[placeholder[modality]],
replacement=partial(get_replacement_ovis, modality=modality),
)
for modality in ("image", "video")
......@@ -476,8 +469,7 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
)
self.vte = VisualEmbedding(config.visual_vocab_size, config.hidden_size)
text_model_type = self.config.get_text_config().model_type
self.image_pad_token_id = IMAGE_PAD_TOKEN_ID_MAP[text_model_type]
self.image_pad_token_id: int = IMAGE_PAD_TOKEN_ID
self.make_empty_intermediate_tensors = (
self.get_language_model().make_empty_intermediate_tensors
......
......@@ -425,6 +425,8 @@ _MULTIMODAL_MODELS = {
),
"Ovis": ("ovis", "Ovis"),
"Ovis2_5": ("ovis2_5", "Ovis2_5"),
"Ovis2_6ForCausalLM": ("ovis2_5", "Ovis2_5"),
"Ovis2_6_MoeForCausalLM": ("ovis2_5", "Ovis2_5"),
"PaddleOCRVLForConditionalGeneration": (
"paddleocr_vl",
"PaddleOCRVLForConditionalGeneration",
......
......@@ -582,7 +582,6 @@ class Siglip2VisionTransformer(nn.Module):
hidden_states = self.embeddings(pixel_values, grid_thws)
last_hidden_state = self.encoder(hidden_states, grid_thws)
last_hidden_state = self.post_layernorm(last_hidden_state)
return last_hidden_state
......
......@@ -78,17 +78,32 @@ class Ovis2_5Processor(ProcessorMixin):
@cached_property
def extra_special_tokens(self):
image_pad_token_id = self.tokenizer.get_vocab()[self.image_pad_token]
extra_special_tokens = {
"image_token": -200,
"video_token": -201,
"visual_atom": -300,
"image_start": -301,
"image_end": -302,
"video_start": -303,
"video_end": -304,
"image_pad": image_pad_token_id,
vocab = self.tokenizer.get_vocab()
required_tokens = {
"image_token": "<image>",
"video_token": "<video>",
"visual_atom": "<ovis_visual_atom>",
"image_start": "<ovis_image_start>",
"image_end": "<ovis_image_end>",
"video_start": "<ovis_video_start>",
"video_end": "<ovis_video_end>",
"image_pad": "<|image_pad|>",
}
extra_special_tokens = {}
suggestion = (
"please add '<image>', '<video>', '<ovis_visual_atom>', "
"'<ovis_image_start>', '<ovis_image_end>', '<ovis_video_start>', "
"'<ovis_video_end>' in 'additional_special_tokens' of "
"tokenizer_config.json, You can refer to "
"https://huggingface.co/AIDC-AI/Ovis2.6-30B-A3B/blob/main/tokenizer_config.json"
)
for key, token_name in required_tokens.items():
if token_name not in vocab:
raise ValueError(f"Can not find {token_name}, {suggestion}")
extra_special_tokens[key] = vocab[token_name]
return extra_special_tokens
def __call__(
......@@ -156,9 +171,6 @@ class Ovis2_5Processor(ProcessorMixin):
- **second_per_grid_ts** -- list of video seconds per time grid.
Returned when `videos` is not `None`.
"""
min_pixels = kwargs.pop("min_pixels", MIN_PIXELS)
max_pixels = kwargs.pop("max_pixels", MAX_PIXELS)
output_kwargs = self._merge_kwargs(
Ovis2_5ProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
......@@ -175,8 +187,6 @@ class Ovis2_5Processor(ProcessorMixin):
for image in images if isinstance(images, list) else [images]:
pixel_values, image_placeholders, grid = self.preprocess_multidata(
images=image,
min_pixels=min_pixels,
max_pixels=max_pixels,
**output_kwargs["images_kwargs"],
)
processed_images.append(pixel_values)
......@@ -197,8 +207,6 @@ class Ovis2_5Processor(ProcessorMixin):
for video in videos if isinstance(videos, list) else [videos]:
pixel_values, video_placeholders, grid = self.preprocess_multidata(
video=video,
min_pixels=min_pixels,
max_pixels=max_pixels,
**output_kwargs["videos_kwargs"],
)
processed_videos.append(pixel_values)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment