Unverified commit bd51f78e, authored by Isotr0py and committed by GitHub
Browse files

[V0 Deprecation][Models] Remove all V0 condition for mm embeddings merge (#25331)


Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: isotr0py <2037008807@qq.com>
parent 65ecb4f1
...@@ -427,17 +427,6 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -427,17 +427,6 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal,
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=input_ids == self.config.image_token_index,
)
input_ids = None
hidden_states = self.language_model.model( hidden_states = self.language_model.model(
input_ids=input_ids, input_ids=input_ids,
positions=positions, positions=positions,
......
...@@ -672,17 +672,6 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, ...@@ -672,17 +672,6 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=input_ids == _IMAGE_TOKEN_ID,
)
input_ids = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,
intermediate_tensors, intermediate_tensors,
......
...@@ -1014,18 +1014,6 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1014,18 +1014,6 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
image_token_id = self.model.vocabulary_mapping.image_token_id
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=input_ids == image_token_id,
)
input_ids = None
hidden_states = self.model(input_ids, hidden_states = self.model(input_ids,
positions, positions,
intermediate_tensors, intermediate_tensors,
......
...@@ -440,17 +440,6 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -440,17 +440,6 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal,
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=input_ids == self.config.image_token_id,
)
input_ids = None
hidden_states = self.language_model.model( hidden_states = self.language_model.model(
input_ids=input_ids, input_ids=input_ids,
positions=positions, positions=positions,
......
...@@ -614,17 +614,6 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -614,17 +614,6 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=input_ids == self.image_token_id,
)
input_ids = None
hidden_states = self.language_model(input_ids, hidden_states = self.language_model(input_ids,
positions, positions,
intermediate_tensors, intermediate_tensors,
......
...@@ -352,17 +352,6 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -352,17 +352,6 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=input_ids == _IMAGE_TOKEN_ID,
)
input_ids = None
hidden_states = self.language_model( hidden_states = self.language_model(
input_ids=input_ids, input_ids=input_ids,
positions=positions, positions=positions,
......
...@@ -596,25 +596,6 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, ...@@ -596,25 +596,6 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=input_ids == self.config.image_token_index,
)
if (vision_embeddings is not None) and len(vision_embeddings) != 0:
kwargs = self.prepare_attn_masks(
input_ids,
positions,
mask_dtype=self.dtype,
**kwargs,
)
input_ids = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,
intermediate_tensors, intermediate_tensors,
......
...@@ -71,7 +71,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, ...@@ -71,7 +71,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.platforms import _Backend from vllm.platforms import _Backend
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import uses_mrope
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from ..layers.activation import SiluAndMul from ..layers.activation import SiluAndMul
...@@ -80,8 +79,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA, ...@@ -80,8 +79,7 @@ from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
from .qwen2_vl import (_create_qwen2vl_field_factory, from .qwen2_vl import (_create_qwen2vl_field_factory,
apply_rotary_pos_emb_vision) apply_rotary_pos_emb_vision)
from .utils import (AutoWeightsLoader, WeightsMapper, from .utils import (AutoWeightsLoader, WeightsMapper,
init_vllm_registered_model, maybe_prefix, init_vllm_registered_model, maybe_prefix)
merge_multimodal_embeddings)
from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -1552,32 +1550,6 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1552,32 +1550,6 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
multimodal_embeddings += video_embeddings multimodal_embeddings += video_embeddings
return multimodal_embeddings return multimodal_embeddings
# V0-only helper (deleted by this commit): builds `inputs_embeds` for the
# forward pass by embedding `input_ids` and then merging in image and/or
# video embeddings when the corresponding parsed input is provided.
#
# Args:
#   input_ids: token ids passed to `self.get_input_embeddings` and to
#     `merge_multimodal_embeddings`.
#   image_input: parsed image inputs, or None to skip the image merge.
#   video_input: parsed video inputs, or None to skip the video merge.
#
# Returns:
#   The embeddings returned by `self.get_input_embeddings`, after zero, one
#   or two `merge_multimodal_embeddings` calls.
def get_input_embeddings_v0(
self,
input_ids: torch.Tensor,
image_input: Optional[Glm4vImageInputs] = None,
video_input: Optional[Glm4vVideoInputs] = None,
) -> torch.Tensor:
# Start from the plain embeddings of the token ids.
inputs_embeds = self.get_input_embeddings(input_ids)
# Images are merged first, using `config.image_token_id` as the placeholder
# token id (presumably the positions where image embeddings are written;
# confirm against `merge_multimodal_embeddings`).
if image_input is not None:
image_embeds = self._process_image_input(image_input)
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
image_embeds,
placeholder_token_id=self.config.image_token_id,
)
# Videos are merged second, on top of the result of the image merge, using
# `config.video_token_id` as the placeholder token id.
if video_input is not None:
video_embeds = self._process_video_input(video_input)
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
video_embeds,
placeholder_token_id=self.config.video_token_id,
)
return inputs_embeds
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
...@@ -1604,26 +1576,6 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1604,26 +1576,6 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner from
# `get_multimodal_embeddings` and `get_input_embeddings`, this
# condition is only for v0 compatibility.
elif inputs_embeds is None:
image_input = self._parse_and_validate_image_input(**kwargs)
video_input = self._parse_and_validate_video_input(**kwargs)
if image_input is None and video_input is None:
inputs_embeds = None
else:
if uses_mrope(self.config):
assert positions.ndim == 2 and positions.size(0) == 3, (
"multimodal section rotary embedding requires "
f"(3, seq_len) positions, but got {positions.size()}")
inputs_embeds = self.get_input_embeddings_v0(
input_ids,
image_input=image_input,
video_input=video_input)
input_ids = None
hidden_states = self.language_model.model( hidden_states = self.language_model.model(
input_ids=input_ids, input_ids=input_ids,
positions=positions, positions=positions,
......
...@@ -43,7 +43,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape ...@@ -43,7 +43,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .chatglm import ChatGLMBaseModel, ChatGLMModel from .chatglm import ChatGLMBaseModel, ChatGLMModel
from .interfaces import (MultiModalEmbeddings, SupportsLoRA, from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
SupportsMultiModal, SupportsPP) SupportsMultiModal, SupportsPP)
from .utils import flatten_bn, isin_list from .utils import flatten_bn
class GLMVImagePixelInputs(TensorSchema): class GLMVImagePixelInputs(TensorSchema):
...@@ -618,21 +618,6 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, ...@@ -618,21 +618,6 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=isin_list(input_ids, [
self.config.boi_token_id,
self.config.pad_token_id,
self.config.eoi_token_id,
]),
)
input_ids = None
hidden_states = self.transformer(input_ids, positions, hidden_states = self.transformer(input_ids, positions,
intermediate_tensors, inputs_embeds) intermediate_tensors, inputs_embeds)
......
...@@ -765,17 +765,6 @@ class GraniteSpeechForConditionalGeneration( ...@@ -765,17 +765,6 @@ class GraniteSpeechForConditionalGeneration(
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
audio_embeds = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
audio_embeds,
is_multimodal=input_ids == self.config.audio_token_index,
)
input_ids = None
model_output = self.language_model(input_ids, positions, model_output = self.language_model(input_ids, positions,
intermediate_tensors, inputs_embeds) intermediate_tensors, inputs_embeds)
return model_output return model_output
......
...@@ -45,8 +45,7 @@ from vllm.sequence import IntermediateTensors ...@@ -45,8 +45,7 @@ from vllm.sequence import IntermediateTensors
from .clip import CLIPVisionModel from .clip import CLIPVisionModel
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
from .siglip import SiglipVisionModel from .siglip import SiglipVisionModel
from .utils import (AutoWeightsLoader, init_vllm_registered_model, isin_list, from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
maybe_prefix)
from .vision import get_vision_encoder_info from .vision import get_vision_encoder_info
EOT = "<|endofturn|>" EOT = "<|endofturn|>"
...@@ -747,18 +746,6 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -747,18 +746,6 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
multimodal_embeddings,
is_multimodal=isin_list(
input_ids,
[self.config.image_token_id, self.config.video_token_id]),
)
input_ids = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,
intermediate_tensors, intermediate_tensors,
......
...@@ -702,17 +702,6 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -702,17 +702,6 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=input_ids == self.config.image_token_id,
)
input_ids = None
hidden_states = self.model.text_model(input_ids, hidden_states = self.model.text_model(input_ids,
positions, positions,
intermediate_tensors, intermediate_tensors,
......
...@@ -40,7 +40,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape ...@@ -40,7 +40,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (MultiModalEmbeddings, SupportsLoRA, from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
SupportsMultiModal, SupportsPP) SupportsMultiModal, SupportsPP)
from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
init_vllm_registered_model, isin_list, maybe_prefix) init_vllm_registered_model, maybe_prefix)
class InternS1MultiModalProjector(nn.Module): class InternS1MultiModalProjector(nn.Module):
...@@ -798,22 +798,6 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -798,22 +798,6 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
input_ids = None input_ids = None
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
context_token_ids = [
token_id for token_id in (self.img_context_token_id,
self.video_context_token_id)
if token_id is not None
]
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=isin_list(input_ids, context_token_ids),
)
input_ids = None
forward_kwargs = { forward_kwargs = {
"input_ids": input_ids, "input_ids": input_ids,
"positions": positions, "positions": positions,
......
...@@ -43,7 +43,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape ...@@ -43,7 +43,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (MultiModalEmbeddings, SupportsLoRA, from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
SupportsMultiModal, SupportsPP) SupportsMultiModal, SupportsPP)
from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
isin_list, maybe_prefix) maybe_prefix)
IMG_START = '<img>' IMG_START = '<img>'
IMG_END = '</img>' IMG_END = '</img>'
...@@ -1371,22 +1371,6 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, ...@@ -1371,22 +1371,6 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
input_ids = None input_ids = None
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
context_token_ids = [
token_id for token_id in (self.img_context_token_id,
self.video_context_token_id)
if token_id is not None
]
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=isin_list(input_ids, context_token_ids),
)
input_ids = None
forward_kwargs = { forward_kwargs = {
"input_ids": input_ids, "input_ids": input_ids,
"positions": positions, "positions": positions,
......
...@@ -433,22 +433,6 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -433,22 +433,6 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal,
) -> IntermediateTensors: ) -> IntermediateTensors:
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner from
# `get_multimodal_embeddings` and `get_input_embeddings`, this
# condition is only for v0 compatibility.
elif inputs_embeds is None:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
inputs_embeds = None
else:
image_embeds = self._process_image_input(image_input)
inputs_embeds = self.get_input_embeddings(
input_ids,
image_embeds,
is_multimodal=input_ids ==
self.config.media_placeholder_token_id,
)
input_ids = None
hidden_states = self.language_model( hidden_states = self.language_model(
input_ids=input_ids, input_ids=input_ids,
......
...@@ -723,17 +723,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -723,17 +723,6 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=input_ids == self.config.image_token_index,
)
input_ids = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,
intermediate_tensors, intermediate_tensors,
......
...@@ -547,17 +547,6 @@ model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens]. ...@@ -547,17 +547,6 @@ model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens].
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=input_ids == self.config.image_token_index,
)
input_ids = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,
intermediate_tensors, intermediate_tensors,
......
...@@ -431,17 +431,6 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -431,17 +431,6 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=input_ids == self.config.video_token_index,
)
input_ids = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,
intermediate_tensors, intermediate_tensors,
......
...@@ -30,8 +30,7 @@ from .llava_next import (BaseLlavaNextMultiModalProcessor, LlavaNextLikeConfig, ...@@ -30,8 +30,7 @@ from .llava_next import (BaseLlavaNextMultiModalProcessor, LlavaNextLikeConfig,
LlavaNextProcessingInfo) LlavaNextProcessingInfo)
from .siglip import SiglipVisionModel from .siglip import SiglipVisionModel
from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
init_vllm_registered_model, maybe_prefix, init_vllm_registered_model, maybe_prefix)
merge_multimodal_embeddings)
# For profile run # For profile run
_MAX_FRAMES_PER_VIDEO = 16 _MAX_FRAMES_PER_VIDEO = 16
...@@ -850,33 +849,6 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -850,33 +849,6 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
return multimodal_embeddings return multimodal_embeddings
# V0-only helper (deleted by this commit): builds `inputs_embeds` for the
# forward pass by embedding `input_ids` and then merging in image and/or
# video embeddings when the corresponding parsed input is provided.
#
# Args:
#   input_ids: token ids passed to `self.get_input_embeddings` and to
#     `merge_multimodal_embeddings`.
#   image_input: parsed image pixel inputs, or None to skip the image merge.
#   video_input: parsed video pixel inputs, or None to skip the video merge.
#
# Returns:
#   The embeddings returned by `self.get_input_embeddings`, after zero, one
#   or two `merge_multimodal_embeddings` calls.
def get_input_embeddings_v0(
self,
input_ids: torch.Tensor,
image_input: Optional[LlavaOnevisionImagePixelInputs] = None,
video_input: Optional[LlavaOnevisionVideoPixelInputs] = None,
) -> torch.Tensor:
# Start from the plain embeddings of the token ids.
inputs_embeds = self.get_input_embeddings(input_ids)
# Images are merged first, using `config.image_token_index` as the
# placeholder token id (presumably the positions where image embeddings are
# written; confirm against `merge_multimodal_embeddings`).
if image_input is not None:
image_embeds = self._process_image_input(image_input)
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
image_embeds,
placeholder_token_id=self.config.image_token_index,
)
# Videos are merged second, on top of the result of the image merge. Note
# the video path calls `_process_video_pixels`, not a `_process_video_input`
# method, and uses `config.video_token_index` as the placeholder token id.
if video_input is not None:
video_embeds = self._process_video_pixels(video_input)
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
video_embeds,
placeholder_token_id=self.config.video_token_index,
)
return inputs_embeds
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
...@@ -894,22 +866,6 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -894,22 +866,6 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner from
# `get_multimodal_embeddings` and `get_input_embeddings`, this
# condition is only for v0 compatibility.
elif inputs_embeds is None:
image_input = self._parse_and_validate_image_input(**kwargs)
video_input = self._parse_and_validate_video_input(**kwargs)
if image_input is None and video_input is None:
inputs_embeds = None
else:
inputs_embeds = self.get_input_embeddings_v0(
input_ids,
image_input=image_input,
video_input=video_input)
input_ids = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,
intermediate_tensors, intermediate_tensors,
......
...@@ -71,7 +71,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape ...@@ -71,7 +71,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .idefics2_vision_model import Idefics2VisionTransformer from .idefics2_vision_model import Idefics2VisionTransformer
from .interfaces import (MultiModalEmbeddings, SupportsLoRA, from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
SupportsMultiModal, SupportsPP) SupportsMultiModal, SupportsPP)
from .utils import AutoWeightsLoader, flatten_bn, isin_list, maybe_prefix from .utils import AutoWeightsLoader, flatten_bn, maybe_prefix
# For profile run # For profile run
_MAX_FRAMES_PER_VIDEO = 16 _MAX_FRAMES_PER_VIDEO = 16
...@@ -1154,19 +1154,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -1154,19 +1154,6 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner from
# `get_multimodal_embeddings` and `get_input_embeddings`, this
# condition is only for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(
input_ids,
vision_embeddings,
is_multimodal=isin_list(input_ids, list(self.mm_token_ids)),
)
input_ids = None
hidden_states = self.llm.model( hidden_states = self.llm.model(
input_ids=input_ids, input_ids=input_ids,
positions=positions, positions=positions,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment