Unverified commit 97d1c993, authored by Harry Mellor, committed by GitHub
Browse files

Rename clashing method names for vLLM model protocol (#27583)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 32262834
......@@ -279,7 +279,7 @@ class EagleProposer:
if self.supports_mm_inputs:
mm_embeds, is_mm_embed = mm_embed_inputs or (None, None)
self.inputs_embeds[:num_tokens] = self.model.get_input_embeddings(
self.inputs_embeds[:num_tokens] = self.model.embed_input_ids(
self.input_ids[:num_tokens],
multimodal_embeddings=mm_embeds,
is_multimodal=is_mm_embed,
......@@ -447,9 +447,7 @@ class EagleProposer:
self._set_positions(batch_size, clamped_positions)
self.hidden_states[:batch_size] = hidden_states
if self.supports_mm_inputs:
self.inputs_embeds[:batch_size] = self.model.get_input_embeddings(
input_ids
)
self.inputs_embeds[:batch_size] = self.model.embed_input_ids(input_ids)
input_ids = None
inputs_embeds = self.inputs_embeds[:input_batch_size]
......@@ -972,9 +970,7 @@ class EagleProposer:
# text-only draft models
try:
dummy_input_ids = torch.tensor([[1]], device=self.input_ids.device)
self.model.get_input_embeddings(
dummy_input_ids, multimodal_embeddings=None
)
self.model.embed_input_ids(dummy_input_ids, multimodal_embeddings=None)
except (NotImplementedError, AttributeError, TypeError):
logger.warning(
"Draft model does not support multimodal inputs, "
......
......@@ -1853,7 +1853,7 @@ class GPUModelRunner(
)
)
micro_batch_outputs = model.get_multimodal_embeddings(
micro_batch_outputs = model.embed_multimodal(
**micro_batch_mm_inputs
)
......@@ -1866,7 +1866,7 @@ class GPUModelRunner(
# 2. A list or tuple (length: num_items) of tensors,
# each of shape (feature_size, hidden_size) in case the feature
# size is dynamic depending on the input multimodal items.
curr_group_outputs = model.get_multimodal_embeddings(**mm_kwargs_group)
curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
sanity_check_mm_encoder_outputs(
curr_group_outputs,
......@@ -2225,7 +2225,7 @@ class GPUModelRunner(
# NOTE(woosuk): To unify token ids and soft tokens (vision
# embeddings), we always use embeddings (rather than token ids)
# as input to the multimodal model, even when the input is text.
inputs_embeds_scheduled = self.model.get_input_embeddings(
inputs_embeds_scheduled = self.model.embed_input_ids(
self.input_ids.gpu[:num_scheduled_tokens],
multimodal_embeddings=mm_embeds,
is_multimodal=is_mm_embed,
......@@ -2261,7 +2261,7 @@ class GPUModelRunner(
# Some tokens ids may need to become embeds
if token_ids_idx.numel() > 0:
token_ids = self.input_ids.gpu[token_ids_idx]
tokens_to_embeds = self.model.get_input_embeddings(input_ids=token_ids)
tokens_to_embeds = self.model.embed_input_ids(input_ids=token_ids)
self.inputs_embeds.gpu[token_ids_idx] = tokens_to_embeds
inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens]
......@@ -3889,7 +3889,7 @@ class GPUModelRunner(
)
# Run multimodal encoder.
dummy_encoder_outputs = self.model.get_multimodal_embeddings(
dummy_encoder_outputs = self.model.embed_multimodal(
**batched_dummy_mm_inputs
)
......
......@@ -962,7 +962,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# (feature_size, hidden_size) in case the feature size is dynamic
# depending on the input multimodal items.
torch_xla.sync(wait=False)
curr_group_outputs = model.get_multimodal_embeddings(**mm_kwargs_group)
curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
torch_xla.sync(wait=False)
sanity_check_mm_encoder_outputs(
......@@ -1065,7 +1065,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# NOTE(woosuk): To unify token ids and soft tokens (vision
# embeddings), we always use embeddings (rather than token ids)
# as input to the multimodal model, even when the input is text.
inputs_embeds = self.model.get_input_embeddings(
inputs_embeds = self.model.embed_input_ids(
input_ids,
multimodal_embeddings=mm_embeds,
is_multimodal=is_mm_embed,
......@@ -1484,14 +1484,12 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
)
# Run multimodal encoder.
torch_xla.sync(wait=False)
mm_embeds = self.model.get_multimodal_embeddings(
**batched_dummy_mm_inputs
)
mm_embeds = self.model.embed_multimodal(**batched_dummy_mm_inputs)
torch_xla.sync(wait=False)
num_patches = mm_embeds[0].shape[0]
items_size = num_patches * num_items
# NOTE (NickLucche) pre-compile `get_input_embeddings` when mm
# NOTE (NickLucche) pre-compile `embed_input_ids` when mm
# embeddings are present. We assume `--disable-mm-chunked`,
# hence only whole items can be scheduled. This implies we just
# need to compile when `num_items` fit the (padded) `input_ids`
......@@ -1519,7 +1517,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
assert a is None
torch_xla.sync(wait=False)
# Pre-compile `get_input_embeddings` when mm_embeddings are not
# Pre-compile `embed_input_ids` when mm_embeddings are not
# present. Chunk is only made of text, no mm_placeholders.
for num_tokens in self.num_tokens_paddings:
placeholders_ids = torch.zeros(
......@@ -1738,7 +1736,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# impact of recompilation until it's fixed.
start = time.perf_counter()
torch_xla.sync(wait=False)
dummy_encoder_outputs = self.model.get_multimodal_embeddings(
dummy_encoder_outputs = self.model.embed_multimodal(
**batched_dummy_mm_inputs
)
torch_xla.sync(wait=False)
......@@ -1974,11 +1972,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
)
return logits_cloned
def get_multimodal_embeddings(self, *args, **kwargs):
return self.model.get_multimodal_embeddings(*args, **kwargs)
def embed_multimodal(self, *args, **kwargs):
return self.model.embed_multimodal(*args, **kwargs)
def get_input_embeddings(self, *args, **kwargs):
return self.model.get_input_embeddings(*args, **kwargs)
def embed_input_ids(self, *args, **kwargs):
return self.model.embed_input_ids(*args, **kwargs)
def prepare_structured_decoding_input(
self, logits: torch.Tensor, grammar_output: "GrammarOutput"
......
......@@ -177,27 +177,27 @@ def sanity_check_mm_encoder_outputs(
) -> None:
"""
Perform sanity checks for the result of
[`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`][].
[`vllm.model_executor.models.SupportsMultiModal.embed_multimodal`][].
"""
assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
"Expected multimodal embeddings to be a list/tuple of 2D tensors, "
f"or a single 3D tensor, but got {type(mm_embeddings)} "
"instead. This is most likely due to incorrect implementation "
"of the model's `get_multimodal_embeddings` method."
"of the model's `embed_multimodal` method."
)
assert len(mm_embeddings) == expected_num_items, (
"Expected number of multimodal embeddings to match number of "
f"input items: {expected_num_items}, but got {len(mm_embeddings)=} "
"instead. This is most likely due to incorrect implementation "
"of the model's `get_multimodal_embeddings` method."
"of the model's `embed_multimodal` method."
)
assert all(e.ndim == 2 for e in mm_embeddings), (
"Expected multimodal embeddings to be a sequence of 2D tensors, "
f"but got tensors with shapes {[e.shape for e in mm_embeddings]} "
"instead. This is most likely due to incorrect implementation "
"of the model's `get_multimodal_embeddings` method."
"of the model's `embed_multimodal` method."
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment