Commit bcf0731a (unverified), authored by myselvess, committed by GitHub
Browse files

[New Model] support new model ovis2.6 (#34426)


Signed-off-by: myselvess <23743269+myselvess@users.noreply.github.com>
parent ec090c24
...@@ -728,6 +728,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen ...@@ -728,6 +728,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `OpenPanguVLForConditionalGeneration` | openpangu-VL | T + I<sup>E+</sup> + V<sup>E+</sup> |`FreedomIntelligence/openPangu-VL-7B` | ✅︎ | ✅︎ | | `OpenPanguVLForConditionalGeneration` | openpangu-VL | T + I<sup>E+</sup> + V<sup>E+</sup> |`FreedomIntelligence/openPangu-VL-7B` | ✅︎ | ✅︎ |
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | | `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | | | `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
| `Ovis2_6ForCausalLM` | Ovis2.6 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.6-2B`, etc. | | |
| `Ovis2_6_MoeForCausalLM` | Ovis2.6 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.6-30B-A3B`, etc. | | |
| `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | | | `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ✅︎ | | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ✅︎ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
......
...@@ -915,6 +915,12 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -915,6 +915,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
}, },
), ),
"Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True), "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
"Ovis2_6ForCausalLM": _HfExamplesInfo(
"AIDC-AI/Ovis2.6-2B", is_available_online=False, trust_remote_code=True
),
"Ovis2_6_MoeForCausalLM": _HfExamplesInfo(
"AIDC-AI/Ovis2.6-30B-A3B", trust_remote_code=True
),
"PaddleOCRVLForConditionalGeneration": _HfExamplesInfo( "PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
"PaddlePaddle/PaddleOCR-VL", "PaddlePaddle/PaddleOCR-VL",
trust_remote_code=True, trust_remote_code=True,
......
...@@ -42,21 +42,12 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape ...@@ -42,21 +42,12 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
IMAGE_TOKEN = "<image>" IMAGE_TOKEN = "<image>"
IMAGE_PLACEHOLDER_ID = 151669
VIDEO_TOKEN = "<video>" VIDEO_TOKEN = "<video>"
INDICATOR_IDS = [-301, -302, -303, -304] VIDEO_PLACEHOLDER_ID = 151670
INDICATOR_IDS = [151672, 151673, 151674, 151675]
IMAGE_PAD_TOKEN_MAP = { IMAGE_PAD_TOKEN_ID = 151655
"gemma2": "<unused0>", THINK_END_TOKEN_ID = 151668
"llama": "<|reserved_special_token_0|>",
"qwen2": "<|image_pad|>",
"qwen3": "<|image_pad|>",
}
IMAGE_PAD_TOKEN_ID_MAP = {
"gemma2": 7,
"llama": 128002,
"qwen2": 151655,
"qwen3": 151655,
}
class Ovis2_5ImagePatchInputs(TensorSchema): class Ovis2_5ImagePatchInputs(TensorSchema):
...@@ -187,17 +178,11 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo): ...@@ -187,17 +178,11 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
vit_config = self.get_hf_config().vit_config vit_config = self.get_hf_config().vit_config
return self.ctx.get_hf_processor( return self.ctx.get_hf_processor(
Ovis2_5Processor, Ovis2_5Processor,
image_pad_token=self.get_image_pad_token(),
patch_size=vit_config.patch_size, patch_size=vit_config.patch_size,
hidden_stride=vit_config.hidden_stride, hidden_stride=vit_config.hidden_stride,
temporal_patch_size=vit_config.temporal_patch_size, temporal_patch_size=vit_config.temporal_patch_size,
) )
def get_image_pad_token(self) -> str:
hf_text_config = self.get_hf_config().get_text_config()
text_model_type = hf_text_config.model_type
return IMAGE_PAD_TOKEN_MAP.get(text_model_type)
def get_image_processor(self) -> BaseImageProcessor: def get_image_processor(self) -> BaseImageProcessor:
return self.get_hf_processor().image_processor # type: ignore return self.get_hf_processor().image_processor # type: ignore
...@@ -342,9 +327,9 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo]) ...@@ -342,9 +327,9 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo])
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
vte_vocab_size = hf_config.visual_vocab_size vte_vocab_size = hf_config.visual_vocab_size
return [ return [
vte_vocab_size - len(INDICATOR_IDS) + abs(x + 300) - 1 vte_vocab_size - len(INDICATOR_IDS) + (x - INDICATOR_IDS[0])
for x in visual_indicators for x in visual_indicators
if x < -300 if x >= INDICATOR_IDS[0]
] ]
def _call_hf_processor( def _call_hf_processor(
...@@ -417,6 +402,14 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo]) ...@@ -417,6 +402,14 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo])
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargsItems, out_mm_kwargs: MultiModalKwargsItems,
) -> list[PromptReplacement]: ) -> list[PromptReplacement]:
tokenizer = self.info.get_tokenizer()
vocab = tokenizer.get_vocab()
placeholder = {
"image": vocab[IMAGE_TOKEN],
"video": vocab[VIDEO_TOKEN],
}
def get_replacement_ovis(item_idx, modality: str): def get_replacement_ovis(item_idx, modality: str):
if modality == "image": if modality == "image":
out_item = out_mm_kwargs["image"][item_idx] out_item = out_mm_kwargs["image"][item_idx]
...@@ -432,7 +425,7 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo]) ...@@ -432,7 +425,7 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo])
return [ return [
PromptReplacement( PromptReplacement(
modality=modality, modality=modality,
target=IMAGE_TOKEN if modality == "image" else VIDEO_TOKEN, target=[placeholder[modality]],
replacement=partial(get_replacement_ovis, modality=modality), replacement=partial(get_replacement_ovis, modality=modality),
) )
for modality in ("image", "video") for modality in ("image", "video")
...@@ -476,8 +469,7 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -476,8 +469,7 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
) )
self.vte = VisualEmbedding(config.visual_vocab_size, config.hidden_size) self.vte = VisualEmbedding(config.visual_vocab_size, config.hidden_size)
text_model_type = self.config.get_text_config().model_type self.image_pad_token_id: int = IMAGE_PAD_TOKEN_ID
self.image_pad_token_id = IMAGE_PAD_TOKEN_ID_MAP[text_model_type]
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.get_language_model().make_empty_intermediate_tensors self.get_language_model().make_empty_intermediate_tensors
......
...@@ -425,6 +425,8 @@ _MULTIMODAL_MODELS = { ...@@ -425,6 +425,8 @@ _MULTIMODAL_MODELS = {
), ),
"Ovis": ("ovis", "Ovis"), "Ovis": ("ovis", "Ovis"),
"Ovis2_5": ("ovis2_5", "Ovis2_5"), "Ovis2_5": ("ovis2_5", "Ovis2_5"),
"Ovis2_6ForCausalLM": ("ovis2_5", "Ovis2_5"),
"Ovis2_6_MoeForCausalLM": ("ovis2_5", "Ovis2_5"),
"PaddleOCRVLForConditionalGeneration": ( "PaddleOCRVLForConditionalGeneration": (
"paddleocr_vl", "paddleocr_vl",
"PaddleOCRVLForConditionalGeneration", "PaddleOCRVLForConditionalGeneration",
......
...@@ -582,7 +582,6 @@ class Siglip2VisionTransformer(nn.Module): ...@@ -582,7 +582,6 @@ class Siglip2VisionTransformer(nn.Module):
hidden_states = self.embeddings(pixel_values, grid_thws) hidden_states = self.embeddings(pixel_values, grid_thws)
last_hidden_state = self.encoder(hidden_states, grid_thws) last_hidden_state = self.encoder(hidden_states, grid_thws)
last_hidden_state = self.post_layernorm(last_hidden_state)
return last_hidden_state return last_hidden_state
......
...@@ -78,17 +78,32 @@ class Ovis2_5Processor(ProcessorMixin): ...@@ -78,17 +78,32 @@ class Ovis2_5Processor(ProcessorMixin):
@cached_property @cached_property
def extra_special_tokens(self): def extra_special_tokens(self):
image_pad_token_id = self.tokenizer.get_vocab()[self.image_pad_token] vocab = self.tokenizer.get_vocab()
extra_special_tokens = { required_tokens = {
"image_token": -200, "image_token": "<image>",
"video_token": -201, "video_token": "<video>",
"visual_atom": -300, "visual_atom": "<ovis_visual_atom>",
"image_start": -301, "image_start": "<ovis_image_start>",
"image_end": -302, "image_end": "<ovis_image_end>",
"video_start": -303, "video_start": "<ovis_video_start>",
"video_end": -304, "video_end": "<ovis_video_end>",
"image_pad": image_pad_token_id, "image_pad": "<|image_pad|>",
} }
extra_special_tokens = {}
suggestion = (
"please add '<image>', '<video>', '<ovis_visual_atom>', "
"'<ovis_image_start>', '<ovis_image_end>', '<ovis_video_start>', "
"'<ovis_video_end>' in 'additional_special_tokens' of "
"tokenizer_config.json, You can refer to "
"https://huggingface.co/AIDC-AI/Ovis2.6-30B-A3B/blob/main/tokenizer_config.json"
)
for key, token_name in required_tokens.items():
if token_name not in vocab:
raise ValueError(f"Can not find {token_name}, {suggestion}")
extra_special_tokens[key] = vocab[token_name]
return extra_special_tokens return extra_special_tokens
def __call__( def __call__(
...@@ -156,9 +171,6 @@ class Ovis2_5Processor(ProcessorMixin): ...@@ -156,9 +171,6 @@ class Ovis2_5Processor(ProcessorMixin):
- **second_per_grid_ts** -- list of video seconds per time grid. - **second_per_grid_ts** -- list of video seconds per time grid.
Returned when `videos` is not `None`. Returned when `videos` is not `None`.
""" """
min_pixels = kwargs.pop("min_pixels", MIN_PIXELS)
max_pixels = kwargs.pop("max_pixels", MAX_PIXELS)
output_kwargs = self._merge_kwargs( output_kwargs = self._merge_kwargs(
Ovis2_5ProcessorKwargs, Ovis2_5ProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs,
...@@ -175,8 +187,6 @@ class Ovis2_5Processor(ProcessorMixin): ...@@ -175,8 +187,6 @@ class Ovis2_5Processor(ProcessorMixin):
for image in images if isinstance(images, list) else [images]: for image in images if isinstance(images, list) else [images]:
pixel_values, image_placeholders, grid = self.preprocess_multidata( pixel_values, image_placeholders, grid = self.preprocess_multidata(
images=image, images=image,
min_pixels=min_pixels,
max_pixels=max_pixels,
**output_kwargs["images_kwargs"], **output_kwargs["images_kwargs"],
) )
processed_images.append(pixel_values) processed_images.append(pixel_values)
...@@ -197,8 +207,6 @@ class Ovis2_5Processor(ProcessorMixin): ...@@ -197,8 +207,6 @@ class Ovis2_5Processor(ProcessorMixin):
for video in videos if isinstance(videos, list) else [videos]: for video in videos if isinstance(videos, list) else [videos]:
pixel_values, video_placeholders, grid = self.preprocess_multidata( pixel_values, video_placeholders, grid = self.preprocess_multidata(
video=video, video=video,
min_pixels=min_pixels,
max_pixels=max_pixels,
**output_kwargs["videos_kwargs"], **output_kwargs["videos_kwargs"],
) )
processed_videos.append(pixel_values) processed_videos.append(pixel_values)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment