Unverified commit 9d30de44, authored by tc-mb, committed by GitHub
Browse files

[model] Support MiniCPM-V 4.5 (#23586)


Signed-off-by: tc-mb <caitianchi@modelbest.cn>
Signed-off-by: Xin Yang <xyangx@amazon.com>
Signed-off-by: Abatom <abzhonghua@gmail.com>
Signed-off-by: chzhang <chaojun.zhang@intel.com>
Signed-off-by: Pate Motter <patemotter@google.com>
Signed-off-by: Terrencezzj <terrence@cohere.ai>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: siyuanf <siyuanf@nvidia.com>
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: Zijing Liu <liuzijing2014@gmail.com>
Signed-off-by: Zijing Liu <liuzijing2014@users.noreply.github.com>
Signed-off-by: jiabin.00 <jiabin.00@bytedance.com>
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: tc-mb <157115220+tc-mb@users.noreply.github.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Huy Do <huydhn@gmail.com>
Signed-off-by: Matúš Námešný <matus.namesny@ameria.com>
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: oye93 <en.ouyang93@outlook.com>
Signed-off-by: Julien Lin <jullin@nvidia.com>
Signed-off-by: Didier Durand <durand.didier@gmail.com>
Signed-off-by: Tianyu Li <tianyu.li@arm.com>
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
Signed-off-by: Yuekai Zhang <zhangyuekai@foxmail.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Zerohertz <ohg3417@gmail.com>
Signed-off-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Signed-off-by: Federico <65908512+coval3nte@users.noreply.github.com>
Signed-off-by: Zixuan Zhang <zixuanzhang@bytedance.com>
Signed-off-by: wuhang <wuhang6@huawei.com>
Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
Signed-off-by: Wei Wei <wwei6@meta.com>
Signed-off-by: Yiheng Xu <charlesyihengxu@gmail.com>
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
Co-authored-by: Xin Yang <105740670+xyang16@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Zhonghua Deng <abzhonghua@gmail.com>
Co-authored-by: Chaojun Zhang <chaojun.zhang@intel.com>
Co-authored-by: Pate Motter <p@temotter.com>
Co-authored-by: Terrence Zhao <32208165+Terrencezzj@users.noreply.github.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: weiliang <weiliangl@nvidia.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Zijing Liu <liuzijing2014@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Raghavan <oneraghavan@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: knlnguyen1802 <knlnguyen1802@gmail.com>
Co-authored-by: Huy Do <huydhn@gmail.com>
Co-authored-by: Matúš Námešný <matus@namesny.com>
Co-authored-by: Guillaume Calmettes <gcalmettes@scaleway.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: En Ouyang <en.ouyang93@outlook.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: nvjullin <jullin@nvidia.com>
Co-authored-by: Didier Durand <2927957+didier-durand@users.noreply.github.com>
Co-authored-by: TianyuLi0 <116711075+TianyuLi0@users.noreply.github.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Yuekai Zhang <zhangyuekai@foxmail.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: Hyogeun Oh (오효근) <ohg3417@gmail.com>
Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Lukas Geiger <lukas.geiger94@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Co-authored-by: Federico <65908512+coval3nte@users.noreply.github.com>
Co-authored-by: zixuanzhang226 <zixuanzhang@bytedance.com>
Co-authored-by: wuhang <wuhang6@huawei.com>
Co-authored-by: yzds <41983536+youzhedian@users.noreply.github.com>
Co-authored-by: hongchao <hongchao@msh.team>
Co-authored-by: czhu-cohere <conway.zhu@cohere.com>
Co-authored-by: Wei <weiweinpu@gmail.com>
Co-authored-by: Yiheng Xu <charlesyihengxu@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Chenheli Hua <huachenheli@outlook.com>
Co-authored-by: CSWYF3634076 <58356743+CSWYF3634076@users.noreply.github.com>
parent 1f7a9c95
...@@ -638,7 +638,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen ...@@ -638,7 +638,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ |
| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I<sup>+</sup> + V<sup>+</sup> | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ |
| `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MiniCPMO` | MiniCPM-O | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup> | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, etc. | ✅︎ | | ✅︎ | | `MiniCPMV` | MiniCPM-V | T + I<sup>E+</sup> + V<sup>E+</sup> | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ |
| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + I<sup>E+</sup> | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ |
| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | | `MllamaForConditionalGeneration` | Llama 3.2 | T + I<sup>+</sup> | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | |
......
...@@ -451,7 +451,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -451,7 +451,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6",
trust_remote_code=True), trust_remote_code=True),
"MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4"}, # noqa: E501 extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4", "4.5": "openbmb/MiniCPM-V-4_5"}, # noqa: E501
trust_remote_code=True), trust_remote_code=True),
"MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501 "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501
trust_remote_code=True, trust_remote_code=True,
......
...@@ -27,12 +27,14 @@ import math ...@@ -27,12 +27,14 @@ import math
from collections import defaultdict from collections import defaultdict
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from functools import partial from functools import partial
from itertools import chain
from typing import Annotated, Any, Callable, Literal, Optional, Union from typing import Annotated, Any, Callable, Literal, Optional, Union
import numpy as np import numpy as np
import torch import torch
import torch.types import torch.types
from torch import nn from torch import nn
from torch.nn.init import trunc_normal_
from transformers import BatchFeature, PretrainedConfig from transformers import BatchFeature, PretrainedConfig
from typing_extensions import TypeVar from typing_extensions import TypeVar
...@@ -47,10 +49,11 @@ from vllm.model_executor.models.llama import LlamaForCausalLM ...@@ -47,10 +49,11 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.model_executor.models.minicpm import MiniCPMForCausalLM from vllm.model_executor.models.minicpm import MiniCPMForCausalLM
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM
from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem, from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem,
ImageProcessorItems, ImageSize, ImageProcessorItems, ImageSize,
ModalityData, ModalityDataItems, ModalityData, ModalityDataItems,
...@@ -218,6 +221,187 @@ class Resampler2_5(BaseResampler): ...@@ -218,6 +221,187 @@ class Resampler2_5(BaseResampler):
return x return x
class Resampler4_5(Resampler2_5):
    """Resampler that adds a 1D temporal position embedding to the keys.

    On top of the behaviour inherited from ``Resampler2_5`` this class keeps a
    non-persistent ``temporal_pos_embed`` buffer (one sin/cos row per temporal
    id) and, when temporal ids are supplied to ``forward``, merges the frames
    that belong to the same temporal group into a single key/value sequence.
    """

    def __init__(self,
                 num_queries: int,
                 embed_dim: int,
                 num_heads: int,
                 kv_dim: Optional[int] = None,
                 norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
                 max_size: tuple[int, int] = (70, 70),
                 max_temporal_size: int = 36000,
                 quant_config: Optional[QuantizationConfig] = None,
                 prefix: str = "") -> None:
        super().__init__(num_queries,
                         embed_dim,
                         num_heads,
                         kv_dim,
                         norm_layer,
                         max_size,
                         quant_config=quant_config,
                         prefix=prefix)

        trunc_normal_(self.query, std=.02)
        # Number of rows currently held by the temporal cache; valid temporal
        # ids are therefore 0 .. max_temporal_size - 1.
        self.max_temporal_size = max_temporal_size
        self._set_temporal_pos_cache(self.max_temporal_size)
        self.apply(self._init_weights)

    def get_1d_sincos_pos_embed_from_temporal_size(self, embed_dim: int,
                                                   pos: np.ndarray):
        """
        embed_dim: output dimension for each position
        pos: a list of positions to be encoded: size (M,)
        out: (M, D)
        """
        assert embed_dim % 2 == 0
        omega = np.arange(embed_dim // 2, dtype=np.float32)
        omega /= embed_dim / 2.
        omega = 1. / 10000**omega  # (D/2,)

        pos = pos.reshape(-1)  # (M,)
        out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

        emb_sin = np.sin(out)  # (M, D/2)
        emb_cos = np.cos(out)  # (M, D/2)

        emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
        return emb

    def _set_temporal_pos_cache(self,
                                max_temporal_size: int,
                                device: torch.types.Device = "cpu") -> None:
        """(Re)build the temporal cache with ``max_temporal_size`` rows.

        Row ``t`` holds the sin/cos embedding of position ``t``; the buffer is
        registered as non-persistent, so it is not part of the state dict.
        """
        temporal_size = np.arange(max_temporal_size, dtype=np.float32)
        pos_embed = torch.from_numpy(
            self.get_1d_sincos_pos_embed_from_temporal_size(
                self.embed_dim, temporal_size)).float().to(device)
        self.register_buffer("temporal_pos_embed", pos_embed, persistent=False)

    def _adjust_temporal_pos_cache(self,
                                   max_temporal_size: int,
                                   device: torch.types.Device = "cpu"):
        """Grow the temporal cache to ``max_temporal_size`` rows if larger."""
        if max_temporal_size > self.max_temporal_size:
            self.max_temporal_size = max_temporal_size
            self._set_temporal_pos_cache(self.max_temporal_size, device)

    def _init_weights(self, m: Union[nn.Linear, nn.LayerNorm]):
        """Initialise ``nn.Linear`` / ``nn.LayerNorm`` submodules."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(
        self,
        x: torch.Tensor,
        tgt_sizes: torch.Tensor,
        # temporal_ids for high refresh rate videos
        temporal_ids=None
    ) -> torch.Tensor:
        """Resample vision features into a fixed number of query tokens.

        Args:
            x: Vision features with one row per image/frame; the first
                dimension must equal ``tgt_sizes.shape[0]``.
            tgt_sizes: Per-item ``(height, width)`` patch grid sizes.
            temporal_ids: Optional nested list of temporal ids, one inner list
                per output group, e.g. ``[[-1], [-1], [2, 6, 9]]``. ``-1``
                means "no temporal embedding" (zeros are used). When every id
                is ``-1`` the temporal path is skipped entirely.

        Returns:
            A ``B * Q * D`` tensor projected by ``self.proj``; ``B`` becomes
            ``len(temporal_ids)`` when the temporal path is taken.
        """
        assert x.shape[0] == tgt_sizes.shape[0]
        bs = x.shape[0]

        device = x.device
        dtype = x.dtype

        patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]

        self._adjust_pos_cache(tgt_sizes, device=device)

        temporal_pos_emb = False
        temporal_ids_flatten = None
        if temporal_ids is not None:
            # example: [[-1], [-1], [2, 6, 9]]
            temporal_ids_flatten = list(chain.from_iterable(temporal_ids))
            max_temporal_id = max(temporal_ids_flatten, default=0)
            if max_temporal_id > -1:
                temporal_pos_emb = True
            # Temporal id ``t`` indexes row ``t`` of the cache, so the cache
            # must hold at least ``t + 1`` rows. Growing only on ``>`` and
            # only to ``t`` rows left ``t`` itself out of range.
            if max_temporal_id >= self.max_temporal_size:
                self._adjust_temporal_pos_cache(max_temporal_id + 1, device)

        max_patch_len = patch_len.max().item()
        assert isinstance(max_patch_len, int)

        key_padding_mask = torch.zeros((bs, max_patch_len),
                                       dtype=torch.bool,
                                       device=device)

        x, _ = self.kv_proj(x)  # B * L * D
        x = self.ln_kv(x).permute(1, 0, 2)  # L * B * D
        q = self.ln_q(self.query)  # Q * D

        pos_embed_2d = []
        pos_embed_temporal = []
        for i in range(bs):
            tgt_h, tgt_w = tgt_sizes[i]
            if temporal_pos_emb:
                if temporal_ids_flatten[i] == -1:
                    # No temporal position for this item: contribute zeros.
                    pos_embed_temporal.append(
                        torch.zeros(self.embed_dim, dtype=dtype,
                                    device=device))
                else:
                    pos_embed_temporal.append(self.temporal_pos_embed[
                        temporal_ids_flatten[i]].to(dtype))  # D

            pos_embed_2d.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape(
                (tgt_h * tgt_w, -1)).to(dtype))  # patches * D
            # Everything past this item's real patches is padding.
            key_padding_mask[i, patch_len[i]:] = True

        pos_embed_2d = torch.nn.utils.rnn.pad_sequence(
            pos_embed_2d, batch_first=True,
            padding_value=0.0).permute(1, 0, 2)  # BLD => L * B * D

        # ``k`` aliases ``x``; ``v`` must be computed before the in-place
        # ``k +=`` below so that it does not pick up the temporal embedding.
        # NOTE(review): the 2D position embedding is added to ``v`` here while
        # the temporal one is added to ``k`` — confirm against the reference
        # implementation that key/value are not swapped.
        k = x
        v = x + pos_embed_2d
        if pos_embed_temporal:
            k += torch.stack(pos_embed_temporal, dim=0)
            bs = len(temporal_ids)
            merge_k = []
            merge_v = []
            merge_key_padding_mask = []

            start = 0
            for tp in temporal_ids:
                end = start + len(tp)
                # L * (end-start) * D -> (end-start) * L * D
                # -> 1 * L*(end-start) * D
                merge_k.append(k[:, start:end, :].permute(1, 0, 2).reshape(
                    -1, self.embed_dim))
                merge_v.append(v[:, start:end, :].permute(1, 0, 2).reshape(
                    -1, self.embed_dim))
                merge_key_padding_mask.append(
                    key_padding_mask[start:end, :].reshape(-1, 1))

                start = end

            k = torch.nn.utils.rnn.pad_sequence(merge_k,
                                                batch_first=True,
                                                padding_value=0.0).permute(
                                                    1, 0, 2)  # L*(end-start)
            v = torch.nn.utils.rnn.pad_sequence(merge_v,
                                                batch_first=True,
                                                padding_value=0.0).permute(
                                                    1, 0, 2)  # L*(end-start)
            # Groups shorter than the longest one are padded as "masked".
            key_padding_mask = torch.nn.utils.rnn.pad_sequence(
                merge_key_padding_mask, batch_first=True,
                padding_value=True).squeeze(-1)

        out = self.attn(
            self._repeat(q, bs),  # Q * B * D
            k,  # L * B * D +  L * B * D
            v,
            key_padding_mask=key_padding_mask,
        )[0]

        #  out: Q * B * D
        x = out.permute(1, 0, 2)  # B * Q * D

        x = self.ln_post(x)
        x = x @ self.proj
        return x
def get_version_by_config(config: PretrainedConfig) -> tuple[int, ...]: def get_version_by_config(config: PretrainedConfig) -> tuple[int, ...]:
version_float = getattr(config, "version", None) version_float = getattr(config, "version", None)
...@@ -354,9 +538,7 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): ...@@ -354,9 +538,7 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
mm_limits = {"image": None} mm_limits = {"image": None}
if self.get_model_version() == (2, if self.get_model_version() in {(2, 6), (4, 0), (4, 5)}:
6) or self.get_model_version() == (4,
0):
mm_limits["video"] = None mm_limits["video"] = None
return mm_limits return mm_limits
...@@ -637,8 +819,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): ...@@ -637,8 +819,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
out_keys: set[str], out_keys: set[str],
) -> dict[str, NestedTensors]: ) -> dict[str, NestedTensors]:
# This processor supports zipping prompt and mm_data together # This processor supports zipping prompt and mm_data together
if self.info.get_model_version() == ( if self.info.get_model_version() in {(2, 6), (4, 0), (4, 5)}:
2, 6) or self.info.get_model_version() == (4, 0):
inputs = super()._call_hf_processor( inputs = super()._call_hf_processor(
prompt=prompts, # type: ignore prompt=prompts, # type: ignore
mm_data=mm_data, mm_data=mm_data,
...@@ -816,7 +997,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -816,7 +997,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
# and config class # and config class
self.config = config self.config = config
self.multimodal_config = multimodal_config self.multimodal_config = multimodal_config
self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
self.version = get_version_by_config(self.config) self.version = get_version_by_config(self.config)
self.llm = self.init_llm(vllm_config=vllm_config, self.llm = self.init_llm(vllm_config=vllm_config,
...@@ -1364,11 +1544,9 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): ...@@ -1364,11 +1544,9 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA):
prefix: str = "", prefix: str = "",
) -> nn.Module: ) -> nn.Module:
quant_config = self._maybe_ignore_quant_config(quant_config) quant_config = self._maybe_ignore_quant_config(quant_config)
model = Idefics2VisionTransformer( model = Idefics2VisionTransformer(config.vision_config,
config.vision_config, quant_config=quant_config,
quant_config=quant_config, prefix=prefix)
prefix=prefix,
use_data_parallel=self.use_data_parallel)
if self.config.drop_vision_last_layer: if self.config.drop_vision_last_layer:
model.encoder.layers = model.encoder.layers[:-1] model.encoder.layers = model.encoder.layers[:-1]
return model return model
...@@ -1436,11 +1614,121 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): ...@@ -1436,11 +1614,121 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA):
return loader.load_weights(weights) return loader.load_weights(weights)
class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA):
    """MiniCPM-V 4.5: Qwen3 language model, Idefics2 vision tower and the
    temporal-aware ``Resampler4_5``."""

    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)
        assert self.version == (4, 5)

    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
        """Return ``None`` for AWQ / AWQ-Marlin configs, else the config."""
        awq_kinds = (AWQConfig, AWQMarlinConfig)
        return None if isinstance(quant_config, awq_kinds) else quant_config

    def init_llm(
        self,
        vllm_config: VllmConfig,
        prefix: str = "",
    ) -> nn.Module:
        """Build the language model backbone."""
        return Qwen3ForCausalLM(vllm_config=vllm_config, prefix=prefix)

    def init_vision_module(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> nn.Module:
        """Build the vision tower, optionally dropping its last layer."""
        vision_tower = Idefics2VisionTransformer(
            config.vision_config,
            quant_config=self._maybe_ignore_quant_config(quant_config),
            prefix=prefix)
        if self.config.drop_vision_last_layer:
            vision_tower.encoder.layers = vision_tower.encoder.layers[:-1]
        return vision_tower

    def init_resampler(
        self,
        embed_dim: int,
        vision_dim: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> nn.Module:
        """Build the resampler and move it to the platform device."""
        effective_quant = self._maybe_ignore_quant_config(quant_config)
        # The module is constructed under a float16 default dtype and then
        # cast to whatever the default dtype is outside that context.
        with set_default_torch_dtype(torch.float16):
            module = Resampler4_5(num_queries=self.config.query_num,
                                  embed_dim=embed_dim,
                                  num_heads=embed_dim // 128,
                                  kv_dim=vision_dim,
                                  quant_config=effective_quant,
                                  prefix=prefix)

        target_dtype = torch.get_default_dtype()
        return module.to(device=current_platform.device_type,
                         dtype=target_dtype)

    def get_vision_hidden_states(
            self, data: MiniCPMVImagePixelInputs) -> torch.Tensor:
        """Encode padded pixel inputs and resample them into query tokens."""
        pixel_values = data["pixel_values"]
        tgt_sizes = data["tgt_sizes"]
        temporal_ids = data.get('temporal_ids', None)

        first = pixel_values[0]
        batch = len(pixel_values)
        patch_dim = first.shape[-2]
        longest = max(item.shape[-1] for item in pixel_values)
        device = first.device

        # Right-pad every item along the last axis to the longest one.
        padded_pixels = torch.zeros((batch, 3, patch_dim, longest),
                                    dtype=first.dtype,
                                    device=device)

        if temporal_ids is None:
            flat_temporal_ids = None
        else:
            flat_temporal_ids = flatten_2d_lists(temporal_ids)

        for row, item in enumerate(pixel_values):
            padded_pixels[row, ..., :item.shape[-1]] = item

        patches_per_item = tgt_sizes.prod(-1)
        widest = patches_per_item.max().item()
        assert isinstance(widest, int)

        # True marks real patches, False marks padding.
        attn_mask = torch.zeros((batch, widest),
                                dtype=torch.bool,
                                device=device)
        for row, count in enumerate(patches_per_item):
            attn_mask[row, :count] = True

        vision_embedding = self.vpm(
            padded_pixels,
            patch_attention_mask=attn_mask.unsqueeze(1),
            tgt_sizes=tgt_sizes,
        )

        return self.resampler(vision_embedding, tgt_sizes, flat_temporal_ids)

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        """Load weights, skipping the ``apm.``, ``audio`` and ``tts`` prefixes."""
        skipped = ["apm.", "audio", "tts"]
        return AutoWeightsLoader(self,
                                 skip_prefixes=skipped).load_weights(weights)
_SUPPORT_VERSION = { _SUPPORT_VERSION = {
(2, 0): MiniCPMV2_0, (2, 0): MiniCPMV2_0,
(2, 5): MiniCPMV2_5, (2, 5): MiniCPMV2_5,
(2, 6): MiniCPMV2_6, (2, 6): MiniCPMV2_6,
(4, 0): MiniCPMV4_0, (4, 0): MiniCPMV4_0,
(4, 5): MiniCPMV4_5,
} }
......
...@@ -20,6 +20,16 @@ def _get_qwen_chat_template_fallback( ...@@ -20,6 +20,16 @@ def _get_qwen_chat_template_fallback(
return CHAT_TEMPLATES_DIR / "template_basic.jinja" return CHAT_TEMPLATES_DIR / "template_basic.jinja"
def _get_minicpmv_chat_template_fallback(
        tokenizer_name_or_path: str) -> Optional[Path]:
    """Pick the fallback chat template for a MiniCPM-V tokenizer.

    Names or paths containing ``4.5`` or ``4_5`` get the dedicated 4.5
    template; everything else falls back to the ChatML template.
    """
    version_markers = ("4.5", "4_5")
    is_v4_5 = any(marker in tokenizer_name_or_path
                  for marker in version_markers)
    template_file = ("template_minicpmv45.jinja"
                     if is_v4_5 else "template_chatml.jinja")
    return CHAT_TEMPLATES_DIR / template_file
# yapf: disable # yapf: disable
_MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = { _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
"blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja", "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
...@@ -27,6 +37,7 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = { ...@@ -27,6 +37,7 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
"deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja", "deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja",
"florence2": CHAT_TEMPLATES_DIR / "template_basic.jinja", "florence2": CHAT_TEMPLATES_DIR / "template_basic.jinja",
"fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja", "fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja",
"minicpmv": _get_minicpmv_chat_template_fallback,
"paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja", "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
"qwen": _get_qwen_chat_template_fallback, "qwen": _get_qwen_chat_template_fallback,
} }
......
{#- ChatML-style chat template (<|im_start|> / <|im_end|> turns) with optional tool calling and <think> reasoning blocks. -#}
{#- `enable_thinking` defaults to false when the caller does not supply it. -#}
{%- set enable_thinking = enable_thinking | default(false) %}
{#- System turn: when tools are given, append the tool signatures and calling instructions to the (optional) leading system message. -#}
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0].role == 'system' %}
{{- messages[0].content + '\n\n' }}
{%- endif %}
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
{%- if messages[0].role == 'system' %}
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{#- Walk the messages backwards to find the index of the last user message that is not a wrapped <tool_response>. -#}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endfor %}
{#- Render every message in order. -#}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{#- Split reasoning from the answer: prefer an explicit reasoning_content field, otherwise parse a <think>...</think> span out of the content. -#}
{%- set content = message.content %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '</think>' in message.content %}
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{#- Only assistant turns after the last real user query keep their <think> block; earlier turns are rendered without reasoning. -#}
{%- if loop.index0 > ns.last_query_index %}
{%- if loop.last or (not loop.last and reasoning_content) %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{#- Serialize tool calls as <tool_call> JSON objects; arguments are emitted verbatim when already a string. -#}
{%- if message.tool_calls %}
{%- for tool_call in message.tool_calls %}
{%- if (loop.first and content) or (not loop.first) %}
{{- '\n' }}
{%- endif %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{%- if tool_call.arguments is string %}
{{- tool_call.arguments }}
{%- else %}
{{- tool_call.arguments | tojson }}
{%- endif %}
{{- '}\n</tool_call>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{#- Consecutive tool messages are grouped into a single user turn, each wrapped in <tool_response> tags. -#}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{#- Generation prompt: with thinking disabled an empty <think> block is pre-filled; with thinking enabled the block is opened and left for the model to continue. -#}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- endif %}
{%- if enable_thinking is defined and enable_thinking is true %}
{{- '<think>\n' }}
{%- endif %}
{%- endif %}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment