Unverified Commit 64bc09ba authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Core] Enable `inputs_embeds_size` separate from `hidden_size` (#29741)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent 47539cfd
...@@ -19,7 +19,12 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts( ...@@ -19,7 +19,12 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
} }
) )
MODELS = ["google/siglip-base-patch16-224", "google/siglip2-base-patch16-224"] MODELS = [
"google/siglip-base-patch16-224",
"google/siglip2-base-patch16-224",
# Different image embedding dim than text_config.hidden_size
"google/siglip2-giant-opt-patch16-384",
]
def _run_test( def _run_test(
......
...@@ -1202,6 +1202,16 @@ class ModelConfig: ...@@ -1202,6 +1202,16 @@ class ModelConfig:
def get_hidden_size(self) -> int: def get_hidden_size(self) -> int:
return getattr(self.hf_text_config, "hidden_size", 0) return getattr(self.hf_text_config, "hidden_size", 0)
def get_inputs_embeds_size(self) -> int:
# The size of inputs_embeds is usually identical to the size
# of the hidden states, however there are exceptions, such as
# embedding models like CLIP and SigLIP
for target_attr in ("projection_dim", "projection_size"):
if hasattr(self.hf_text_config, target_attr):
return getattr(self.hf_text_config, target_attr)
return self.get_hidden_size()
@property @property
def is_deepseek_mla(self) -> bool: def is_deepseek_mla(self) -> bool:
if not hasattr(self.hf_text_config, "model_type"): if not hasattr(self.hf_text_config, "model_type"):
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Callable, Iterable, Mapping, Sequence
from functools import cached_property from functools import cached_property
from typing import Annotated, Literal from typing import Annotated, Literal
...@@ -903,6 +903,41 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant): ...@@ -903,6 +903,41 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.text_model return self.text_model
def _embed_text_input_ids(
self,
input_ids: torch.Tensor,
embed_input_ids: Callable[[torch.Tensor], torch.Tensor],
*,
is_multimodal: torch.Tensor | None,
handle_oov_mm_token: bool,
) -> torch.Tensor:
inputs_embeds = super()._embed_text_input_ids(
input_ids,
embed_input_ids,
is_multimodal=is_multimodal,
handle_oov_mm_token=handle_oov_mm_token,
)
# NOTE: inputs_embeds in model runner has size text_config.projection_dim
# (instead of text_config.hidden_size) to accommodate image embeddings
inputs_embeds_size = self.projection_dim
if inputs_embeds.shape[1] < inputs_embeds_size:
inputs_embeds = torch.cat(
[
inputs_embeds,
inputs_embeds.new_empty(
inputs_embeds.shape[0],
inputs_embeds_size - inputs_embeds.shape[1],
),
],
dim=1,
)
elif inputs_embeds.shape[1] > inputs_embeds_size:
# No need to handle this case for now
raise NotImplementedError
return inputs_embeds
def embed_input_ids( def embed_input_ids(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
...@@ -949,10 +984,16 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant): ...@@ -949,10 +984,16 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
if not self._is_text_input: if not self._is_text_input:
return inputs_embeds return inputs_embeds
# Text inputs # NOTE: inputs_embeds in model runner has size text_config.projection_dim
return self.get_text_features( # (instead of text_config.hidden_size) to accommodate image embeddings
input_ids=input_ids, position_ids=positions, inputs_embeds=inputs_embeds hidden_size = self.text_embed_dim
) if inputs_embeds.shape[1] > hidden_size:
inputs_embeds = inputs_embeds[:, :hidden_size]
elif inputs_embeds.shape[1] < hidden_size:
# No need to handle this case for now
raise NotImplementedError
return self.get_text_features(input_ids, positions, inputs_embeds)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
loader = AutoWeightsLoader( loader = AutoWeightsLoader(
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Implementation of SiglipVisionModel intended to be only used
within a vision language model."""
import math import math
from collections.abc import Iterable, Mapping from collections.abc import Callable, Iterable, Mapping
from functools import cached_property from functools import cached_property
from typing import Annotated, Literal from typing import Annotated, Literal
...@@ -976,6 +974,7 @@ class SiglipTextEmbeddings(nn.Module): ...@@ -976,6 +974,7 @@ class SiglipTextEmbeddings(nn.Module):
position_embeddings = self.position_embedding(position_ids) position_embeddings = self.position_embedding(position_ids)
embeddings = inputs_embeds + position_embeddings embeddings = inputs_embeds + position_embeddings
return embeddings return embeddings
...@@ -1145,6 +1144,41 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant): ...@@ -1145,6 +1144,41 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.text_model return self.text_model
def _embed_text_input_ids(
self,
input_ids: torch.Tensor,
embed_input_ids: Callable[[torch.Tensor], torch.Tensor],
*,
is_multimodal: torch.Tensor | None,
handle_oov_mm_token: bool,
) -> torch.Tensor:
inputs_embeds = super()._embed_text_input_ids(
input_ids,
embed_input_ids,
is_multimodal=is_multimodal,
handle_oov_mm_token=handle_oov_mm_token,
)
# NOTE: inputs_embeds in model runner has size text_config.projection_size
# (instead of text_config.hidden_size) to accommodate image embeddings
inputs_embeds_size = self.text_projection_size
if inputs_embeds.shape[1] < inputs_embeds_size:
inputs_embeds = torch.cat(
[
inputs_embeds,
inputs_embeds.new_empty(
inputs_embeds.shape[0],
inputs_embeds_size - inputs_embeds.shape[1],
),
],
dim=1,
)
elif inputs_embeds.shape[1] > inputs_embeds_size:
# No need to handle this case for now
raise NotImplementedError
return inputs_embeds
def embed_input_ids( def embed_input_ids(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
...@@ -1190,6 +1224,15 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant): ...@@ -1190,6 +1224,15 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
if not self._is_text_input: if not self._is_text_input:
return inputs_embeds return inputs_embeds
# NOTE: inputs_embeds in model runner has size text_config.projection_size
# (instead of text_config.hidden_size) to accommodate image embeddings
hidden_size = self.text_embed_dim
if inputs_embeds.shape[1] > hidden_size:
inputs_embeds = inputs_embeds[:, :hidden_size]
elif inputs_embeds.shape[1] < hidden_size:
# No need to handle this case for now
raise NotImplementedError
return self.get_text_features(input_ids, positions, inputs_embeds) return self.get_text_features(input_ids, positions, inputs_embeds)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
......
...@@ -80,6 +80,7 @@ class EagleProposer: ...@@ -80,6 +80,7 @@ class EagleProposer:
# the draft model's hidden size can be different from the target model's # the draft model's hidden size can be different from the target model's
# hidden size (e.g., Llama 3.3 70B). # hidden size (e.g., Llama 3.3 70B).
self.hidden_size = self.draft_model_config.get_hidden_size() self.hidden_size = self.draft_model_config.get_hidden_size()
self.inputs_embeds_size = self.draft_model_config.get_inputs_embeds_size()
# Multi-modal data support # Multi-modal data support
self.mm_registry = MULTIMODAL_REGISTRY self.mm_registry = MULTIMODAL_REGISTRY
...@@ -151,7 +152,9 @@ class EagleProposer: ...@@ -151,7 +152,9 @@ class EagleProposer:
) )
self.inputs_embeds = torch.zeros( self.inputs_embeds = torch.zeros(
(self.max_num_tokens, self.hidden_size), dtype=self.dtype, device=device (self.max_num_tokens, self.inputs_embeds_size),
dtype=self.dtype,
device=device,
) )
self.backup_next_token_ids = CpuGpuBuffer( self.backup_next_token_ids = CpuGpuBuffer(
......
...@@ -17,7 +17,7 @@ class InputBuffers: ...@@ -17,7 +17,7 @@ class InputBuffers:
self, self,
max_num_reqs: int, max_num_reqs: int,
max_num_tokens: int, max_num_tokens: int,
hidden_size: int, inputs_embeds_size: int,
vocab_size: int, vocab_size: int,
dtype: torch.dtype, dtype: torch.dtype,
device: torch.device, device: torch.device,
......
...@@ -98,7 +98,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -98,7 +98,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.max_model_len = self.model_config.max_model_len self.max_model_len = self.model_config.max_model_len
self.max_num_tokens = self.scheduler_config.max_num_batched_tokens self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
self.max_num_reqs = self.scheduler_config.max_num_seqs self.max_num_reqs = self.scheduler_config.max_num_seqs
self.hidden_size = self.model_config.get_hidden_size() self.inputs_embeds_size = self.model_config.get_inputs_embeds_size()
self.dp_size = self.parallel_config.data_parallel_size self.dp_size = self.parallel_config.data_parallel_size
self.dp_rank = self.parallel_config.data_parallel_rank self.dp_rank = self.parallel_config.data_parallel_rank
...@@ -134,7 +134,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -134,7 +134,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.input_buffers = InputBuffers( self.input_buffers = InputBuffers(
max_num_reqs=self.max_num_reqs, max_num_reqs=self.max_num_reqs,
max_num_tokens=self.max_num_tokens, max_num_tokens=self.max_num_tokens,
hidden_size=self.hidden_size, inputs_embeds_size=self.inputs_embeds_size,
vocab_size=self.vocab_size, vocab_size=self.vocab_size,
dtype=self.dtype, dtype=self.dtype,
device=self.device, device=self.device,
......
...@@ -44,6 +44,7 @@ class EagleSpeculator: ...@@ -44,6 +44,7 @@ class EagleSpeculator:
# the draft model's hidden size can be different from the target model's # the draft model's hidden size can be different from the target model's
# hidden size (e.g., Llama 3.3 70B). # hidden size (e.g., Llama 3.3 70B).
self.hidden_size = self.draft_model_config.get_hidden_size() self.hidden_size = self.draft_model_config.get_hidden_size()
self.inputs_embeds_size = self.draft_model_config.get_inputs_embeds_size()
self.vocab_size = self.draft_model_config.get_vocab_size() self.vocab_size = self.draft_model_config.get_vocab_size()
self.pin_memory = is_pin_memory_available() self.pin_memory = is_pin_memory_available()
self.dtype = vllm_config.model_config.dtype self.dtype = vllm_config.model_config.dtype
...@@ -51,7 +52,7 @@ class EagleSpeculator: ...@@ -51,7 +52,7 @@ class EagleSpeculator:
self.input_buffers = InputBuffers( self.input_buffers = InputBuffers(
max_num_reqs=self.max_num_reqs, max_num_reqs=self.max_num_reqs,
max_num_tokens=self.max_num_tokens, max_num_tokens=self.max_num_tokens,
hidden_size=self.hidden_size, inputs_embeds_size=self.inputs_embeds_size,
vocab_size=self.vocab_size, vocab_size=self.vocab_size,
dtype=self.dtype, dtype=self.dtype,
device=device, device=device,
......
...@@ -320,7 +320,7 @@ class GPUModelRunner( ...@@ -320,7 +320,7 @@ class GPUModelRunner(
# Model-related. # Model-related.
self.num_query_heads = model_config.get_num_attention_heads(parallel_config) self.num_query_heads = model_config.get_num_attention_heads(parallel_config)
self.hidden_size = model_config.get_hidden_size() self.inputs_embeds_size = model_config.get_inputs_embeds_size()
self.attention_chunk_size = model_config.attention_chunk_size self.attention_chunk_size = model_config.attention_chunk_size
# Only relevant for models using ALiBi (e.g, MPT) # Only relevant for models using ALiBi (e.g, MPT)
self.use_alibi = model_config.uses_alibi self.use_alibi = model_config.uses_alibi
...@@ -485,7 +485,7 @@ class GPUModelRunner( ...@@ -485,7 +485,7 @@ class GPUModelRunner(
# version of this tensor, avoid a RuntimeError by not creating a # version of this tensor, avoid a RuntimeError by not creating a
# numpy buffer. # numpy buffer.
self.inputs_embeds = self._make_buffer( self.inputs_embeds = self._make_buffer(
self.max_num_tokens, self.hidden_size, dtype=self.dtype, numpy=False self.max_num_tokens, self.inputs_embeds_size, dtype=self.dtype, numpy=False
) )
self.is_token_ids = self._make_buffer(self.max_num_tokens, dtype=torch.bool) self.is_token_ids = self._make_buffer(self.max_num_tokens, dtype=torch.bool)
self.discard_request_mask = self._make_buffer( self.discard_request_mask = self._make_buffer(
......
...@@ -215,7 +215,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -215,7 +215,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.num_query_heads = model_config.get_num_attention_heads(parallel_config) self.num_query_heads = model_config.get_num_attention_heads(parallel_config)
self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)
self.head_size = model_config.get_head_size() self.head_size = model_config.get_head_size()
self.hidden_size = model_config.get_hidden_size() self.inputs_embeds_size = model_config.get_inputs_embeds_size()
self.vocab_size = model_config.get_vocab_size() self.vocab_size = model_config.get_vocab_size()
# Multi-modal data support # Multi-modal data support
...@@ -1406,7 +1406,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -1406,7 +1406,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
if self.supports_mm_inputs: if self.supports_mm_inputs:
input_ids = None input_ids = None
inputs_embeds = torch.zeros( inputs_embeds = torch.zeros(
(num_tokens, self.hidden_size), dtype=self.dtype, device=self.device (num_tokens, self.inputs_embeds_size),
dtype=self.dtype,
device=self.device,
) )
else: else:
input_ids = torch.zeros((num_tokens), dtype=torch.int32).to(self.device) input_ids = torch.zeros((num_tokens), dtype=torch.int32).to(self.device)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment