Unverified commit 9e0a147d, authored by Roger Wang, committed by GitHub
Browse files

[V1] Update interface for mistral-format Pixtral (#10703)


Signed-off-by: Roger Wang <ywang@roblox.com>
parent 418cb3b9
...@@ -31,7 +31,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -31,7 +31,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.models.utils import merge_multimodal_embeddings
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges, consecutive_placeholder_ranges,
resolve_visual_encoder_outputs) resolve_visual_encoder_outputs)
...@@ -190,6 +190,25 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -190,6 +190,25 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
return get_sampler() return get_sampler()
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Compute vision embeddings from the multimodal keyword arguments.

    The keyword arguments are forwarded unchanged to
    ``_parse_and_validate_image_input``. When that helper reports no
    image input (returns ``None``), this method returns ``None`` as
    well; otherwise the parsed input is handed to
    ``_process_image_input`` and its result is returned as-is.
    """
    parsed_images = self._parse_and_validate_image_input(**kwargs)
    return (None if parsed_images is None else
            self._process_image_input(parsed_images))
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Build the input embeddings consumed by the language model.

    Args:
        input_ids: Token ids passed to the language model's
            ``get_input_embeddings``.
        multimodal_embeddings: Optional embeddings to merge into the
            text embeddings. When ``None``, the text embeddings are
            returned untouched.

    Returns:
        The language-model embeddings of ``input_ids``; when
        ``multimodal_embeddings`` is given, the result of
        ``merge_multimodal_embeddings`` called with the configured
        ``vision_args.image_token_id``.
    """
    text_embeds = self.language_model.get_input_embeddings(input_ids)

    # Text-only request: nothing to merge.
    if multimodal_embeddings is None:
        return text_embeds

    image_token_id = self.vision_args.image_token_id
    return merge_multimodal_embeddings(input_ids, text_embeds,
                                       multimodal_embeddings,
                                       image_token_id)
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
...@@ -197,31 +216,21 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -197,31 +216,21 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object, **kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]: ) -> Union[torch.Tensor, IntermediateTensors]:
"""Run forward pass for pixtral. """Run forward pass for pixtral.
TODO
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
input_ids = None
inputs_embeds = None inputs_embeds = None
else:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is not None:
vision_embeddings = self._process_image_input(image_input)
inputs_embeds = self.language_model.model.get_input_embeddings(
input_ids)
inputs_embeds = merge_multimodal_embeddings( # NOTE: In v1, inputs_embeds is always generated at model runner, this
input_ids, inputs_embeds, vision_embeddings, # condition is for v0 compatibility.
self.vision_args.image_token_id) elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
input_ids = None inputs_embeds = self.get_input_embeddings(input_ids,
else: vision_embeddings)
inputs_embeds = None input_ids = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment