Unverified Commit 97d1c993 authored by Harry Mellor, committed by GitHub
Browse files

Rename clashing method names for vLLM model protocol (#27583)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 32262834
...@@ -279,7 +279,7 @@ class EagleProposer: ...@@ -279,7 +279,7 @@ class EagleProposer:
if self.supports_mm_inputs: if self.supports_mm_inputs:
mm_embeds, is_mm_embed = mm_embed_inputs or (None, None) mm_embeds, is_mm_embed = mm_embed_inputs or (None, None)
self.inputs_embeds[:num_tokens] = self.model.get_input_embeddings( self.inputs_embeds[:num_tokens] = self.model.embed_input_ids(
self.input_ids[:num_tokens], self.input_ids[:num_tokens],
multimodal_embeddings=mm_embeds, multimodal_embeddings=mm_embeds,
is_multimodal=is_mm_embed, is_multimodal=is_mm_embed,
...@@ -447,9 +447,7 @@ class EagleProposer: ...@@ -447,9 +447,7 @@ class EagleProposer:
self._set_positions(batch_size, clamped_positions) self._set_positions(batch_size, clamped_positions)
self.hidden_states[:batch_size] = hidden_states self.hidden_states[:batch_size] = hidden_states
if self.supports_mm_inputs: if self.supports_mm_inputs:
self.inputs_embeds[:batch_size] = self.model.get_input_embeddings( self.inputs_embeds[:batch_size] = self.model.embed_input_ids(input_ids)
input_ids
)
input_ids = None input_ids = None
inputs_embeds = self.inputs_embeds[:input_batch_size] inputs_embeds = self.inputs_embeds[:input_batch_size]
...@@ -972,9 +970,7 @@ class EagleProposer: ...@@ -972,9 +970,7 @@ class EagleProposer:
# text-only draft models # text-only draft models
try: try:
dummy_input_ids = torch.tensor([[1]], device=self.input_ids.device) dummy_input_ids = torch.tensor([[1]], device=self.input_ids.device)
self.model.get_input_embeddings( self.model.embed_input_ids(dummy_input_ids, multimodal_embeddings=None)
dummy_input_ids, multimodal_embeddings=None
)
except (NotImplementedError, AttributeError, TypeError): except (NotImplementedError, AttributeError, TypeError):
logger.warning( logger.warning(
"Draft model does not support multimodal inputs, " "Draft model does not support multimodal inputs, "
......
...@@ -1853,7 +1853,7 @@ class GPUModelRunner( ...@@ -1853,7 +1853,7 @@ class GPUModelRunner(
) )
) )
micro_batch_outputs = model.get_multimodal_embeddings( micro_batch_outputs = model.embed_multimodal(
**micro_batch_mm_inputs **micro_batch_mm_inputs
) )
...@@ -1866,7 +1866,7 @@ class GPUModelRunner( ...@@ -1866,7 +1866,7 @@ class GPUModelRunner(
# 2. A list or tuple (length: num_items) of tensors, # 2. A list or tuple (length: num_items) of tensors,
# each of shape (feature_size, hidden_size) in case the feature # each of shape (feature_size, hidden_size) in case the feature
# size is dynamic depending on the input multimodal items. # size is dynamic depending on the input multimodal items.
curr_group_outputs = model.get_multimodal_embeddings(**mm_kwargs_group) curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
sanity_check_mm_encoder_outputs( sanity_check_mm_encoder_outputs(
curr_group_outputs, curr_group_outputs,
...@@ -2225,7 +2225,7 @@ class GPUModelRunner( ...@@ -2225,7 +2225,7 @@ class GPUModelRunner(
# NOTE(woosuk): To unify token ids and soft tokens (vision # NOTE(woosuk): To unify token ids and soft tokens (vision
# embeddings), we always use embeddings (rather than token ids) # embeddings), we always use embeddings (rather than token ids)
# as input to the multimodal model, even when the input is text. # as input to the multimodal model, even when the input is text.
inputs_embeds_scheduled = self.model.get_input_embeddings( inputs_embeds_scheduled = self.model.embed_input_ids(
self.input_ids.gpu[:num_scheduled_tokens], self.input_ids.gpu[:num_scheduled_tokens],
multimodal_embeddings=mm_embeds, multimodal_embeddings=mm_embeds,
is_multimodal=is_mm_embed, is_multimodal=is_mm_embed,
...@@ -2261,7 +2261,7 @@ class GPUModelRunner( ...@@ -2261,7 +2261,7 @@ class GPUModelRunner(
# Some tokens ids may need to become embeds # Some tokens ids may need to become embeds
if token_ids_idx.numel() > 0: if token_ids_idx.numel() > 0:
token_ids = self.input_ids.gpu[token_ids_idx] token_ids = self.input_ids.gpu[token_ids_idx]
tokens_to_embeds = self.model.get_input_embeddings(input_ids=token_ids) tokens_to_embeds = self.model.embed_input_ids(input_ids=token_ids)
self.inputs_embeds.gpu[token_ids_idx] = tokens_to_embeds self.inputs_embeds.gpu[token_ids_idx] = tokens_to_embeds
inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens] inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens]
...@@ -3889,7 +3889,7 @@ class GPUModelRunner( ...@@ -3889,7 +3889,7 @@ class GPUModelRunner(
) )
# Run multimodal encoder. # Run multimodal encoder.
dummy_encoder_outputs = self.model.get_multimodal_embeddings( dummy_encoder_outputs = self.model.embed_multimodal(
**batched_dummy_mm_inputs **batched_dummy_mm_inputs
) )
......
...@@ -962,7 +962,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -962,7 +962,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# (feature_size, hidden_size) in case the feature size is dynamic # (feature_size, hidden_size) in case the feature size is dynamic
# depending on the input multimodal items. # depending on the input multimodal items.
torch_xla.sync(wait=False) torch_xla.sync(wait=False)
curr_group_outputs = model.get_multimodal_embeddings(**mm_kwargs_group) curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
torch_xla.sync(wait=False) torch_xla.sync(wait=False)
sanity_check_mm_encoder_outputs( sanity_check_mm_encoder_outputs(
...@@ -1065,7 +1065,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -1065,7 +1065,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# NOTE(woosuk): To unify token ids and soft tokens (vision # NOTE(woosuk): To unify token ids and soft tokens (vision
# embeddings), we always use embeddings (rather than token ids) # embeddings), we always use embeddings (rather than token ids)
# as input to the multimodal model, even when the input is text. # as input to the multimodal model, even when the input is text.
inputs_embeds = self.model.get_input_embeddings( inputs_embeds = self.model.embed_input_ids(
input_ids, input_ids,
multimodal_embeddings=mm_embeds, multimodal_embeddings=mm_embeds,
is_multimodal=is_mm_embed, is_multimodal=is_mm_embed,
...@@ -1484,14 +1484,12 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -1484,14 +1484,12 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
) )
# Run multimodal encoder. # Run multimodal encoder.
torch_xla.sync(wait=False) torch_xla.sync(wait=False)
mm_embeds = self.model.get_multimodal_embeddings( mm_embeds = self.model.embed_multimodal(**batched_dummy_mm_inputs)
**batched_dummy_mm_inputs
)
torch_xla.sync(wait=False) torch_xla.sync(wait=False)
num_patches = mm_embeds[0].shape[0] num_patches = mm_embeds[0].shape[0]
items_size = num_patches * num_items items_size = num_patches * num_items
# NOTE (NickLucche) pre-compile `get_input_embeddings` when mm # NOTE (NickLucche) pre-compile `embed_input_ids` when mm
# embeddings are present. We assume `--disable-mm-chunked`, # embeddings are present. We assume `--disable-mm-chunked`,
# hence only whole items can be scheduled. This implies we just # hence only whole items can be scheduled. This implies we just
# need to compile when `num_items` fit the (padded) `input_ids` # need to compile when `num_items` fit the (padded) `input_ids`
...@@ -1519,7 +1517,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -1519,7 +1517,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
assert a is None assert a is None
torch_xla.sync(wait=False) torch_xla.sync(wait=False)
# Pre-compile `get_input_embeddings` when mm_embeddings are not # Pre-compile `embed_input_ids` when mm_embeddings are not
# present. Chunk is only made of text, no mm_placeholders. # present. Chunk is only made of text, no mm_placeholders.
for num_tokens in self.num_tokens_paddings: for num_tokens in self.num_tokens_paddings:
placeholders_ids = torch.zeros( placeholders_ids = torch.zeros(
...@@ -1738,7 +1736,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -1738,7 +1736,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# impact of recompilation until it's fixed. # impact of recompilation until it's fixed.
start = time.perf_counter() start = time.perf_counter()
torch_xla.sync(wait=False) torch_xla.sync(wait=False)
dummy_encoder_outputs = self.model.get_multimodal_embeddings( dummy_encoder_outputs = self.model.embed_multimodal(
**batched_dummy_mm_inputs **batched_dummy_mm_inputs
) )
torch_xla.sync(wait=False) torch_xla.sync(wait=False)
...@@ -1974,11 +1972,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -1974,11 +1972,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
) )
return logits_cloned return logits_cloned
def get_multimodal_embeddings(self, *args, **kwargs): def embed_multimodal(self, *args, **kwargs):
return self.model.get_multimodal_embeddings(*args, **kwargs) return self.model.embed_multimodal(*args, **kwargs)
def get_input_embeddings(self, *args, **kwargs): def embed_input_ids(self, *args, **kwargs):
return self.model.get_input_embeddings(*args, **kwargs) return self.model.embed_input_ids(*args, **kwargs)
def prepare_structured_decoding_input( def prepare_structured_decoding_input(
self, logits: torch.Tensor, grammar_output: "GrammarOutput" self, logits: torch.Tensor, grammar_output: "GrammarOutput"
......
...@@ -177,27 +177,27 @@ def sanity_check_mm_encoder_outputs( ...@@ -177,27 +177,27 @@ def sanity_check_mm_encoder_outputs(
) -> None: ) -> None:
""" """
Perform sanity checks for the result of Perform sanity checks for the result of
[`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`][]. [`vllm.model_executor.models.SupportsMultiModal.embed_multimodal`][].
""" """
assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), ( assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
"Expected multimodal embeddings to be a list/tuple of 2D tensors, " "Expected multimodal embeddings to be a list/tuple of 2D tensors, "
f"or a single 3D tensor, but got {type(mm_embeddings)} " f"or a single 3D tensor, but got {type(mm_embeddings)} "
"instead. This is most likely due to incorrect implementation " "instead. This is most likely due to incorrect implementation "
"of the model's `get_multimodal_embeddings` method." "of the model's `embed_multimodal` method."
) )
assert len(mm_embeddings) == expected_num_items, ( assert len(mm_embeddings) == expected_num_items, (
"Expected number of multimodal embeddings to match number of " "Expected number of multimodal embeddings to match number of "
f"input items: {expected_num_items}, but got {len(mm_embeddings)=} " f"input items: {expected_num_items}, but got {len(mm_embeddings)=} "
"instead. This is most likely due to incorrect implementation " "instead. This is most likely due to incorrect implementation "
"of the model's `get_multimodal_embeddings` method." "of the model's `embed_multimodal` method."
) )
assert all(e.ndim == 2 for e in mm_embeddings), ( assert all(e.ndim == 2 for e in mm_embeddings), (
"Expected multimodal embeddings to be a sequence of 2D tensors, " "Expected multimodal embeddings to be a sequence of 2D tensors, "
f"but got tensors with shapes {[e.shape for e in mm_embeddings]} " f"but got tensors with shapes {[e.shape for e in mm_embeddings]} "
"instead. This is most likely due to incorrect implementation " "instead. This is most likely due to incorrect implementation "
"of the model's `get_multimodal_embeddings` method." "of the model's `embed_multimodal` method."
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment