Commit 8d75f22e authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori

parents ce888aa4 7d80c73d
......@@ -89,16 +89,14 @@ class GPTNeoXAttention(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.dense",
)
scaling = self.head_size**-0.5
rotary_dim = int(self.head_size * config.rotary_pct)
assert rotary_dim % 2 == 0
max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
self.rotary_emb = get_rope(
self.head_size,
rotary_dim=rotary_dim,
rotary_dim=self.head_size,
max_position=max_position_embeddings,
rope_parameters=config.rope_parameters,
)
scaling = self.head_size**-0.5
self.attn = Attention(
self.num_heads,
self.head_size,
......
......@@ -564,7 +564,6 @@ class GraniteSpeechForConditionalGeneration(
SupportsLoRA,
SupportsTranscription,
):
merge_by_field_config = True
supported_languages = ISO639_1_SUPPORTED_LANGS
packed_modules_mapping = {
......
......@@ -14,8 +14,6 @@ from vllm.model_executor.layers.pooler import (
PoolerHead,
PoolerNormalize,
PoolingParamsUpdate,
get_prompt_lens,
get_prompt_token_ids,
)
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.tasks import PoolingTask
......@@ -153,11 +151,11 @@ class GritLMMeanPool(nn.Module):
hidden_states: torch.Tensor | list[torch.Tensor],
pooling_metadata: PoolingMetadata,
) -> list[torch.Tensor] | torch.Tensor:
prompt_lens = get_prompt_lens(hidden_states, pooling_metadata)
prompt_lens = pooling_metadata.prompt_lens
instr_lens = torch.tensor(
[
self._get_instruction_len(token_ids.cpu().numpy())
for token_ids in get_prompt_token_ids(pooling_metadata)
for token_ids in pooling_metadata.get_prompt_token_ids()
],
device="cpu",
)
......
......@@ -62,6 +62,7 @@ from vllm.multimodal.inputs import (
from vllm.multimodal.parse import (
DictEmbeddingItems,
ImageSize,
ModalityDataItems,
MultiModalDataItems,
MultiModalDataParser,
)
......@@ -562,7 +563,7 @@ def _hunyuan_vl_field_config(hf_inputs: Mapping[str, torch.Tensor]):
return dict(
pixel_values=MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes),
image_embeds=MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes),
image_grid_thw=MultiModalFieldConfig.batched("image"),
image_grid_thw=MultiModalFieldConfig.batched("image", keep_on_cpu=True),
)
......@@ -570,7 +571,7 @@ class HunYuanVLMultiModalDataParser(MultiModalDataParser):
def _parse_image_data(
self,
data: dict[str, torch.Tensor] | ModalityData[ImageItem],
):
) -> ModalityDataItems[Any, Any] | None:
if isinstance(data, dict):
return DictEmbeddingItems(
data,
......@@ -785,8 +786,6 @@ class HunYuanVLForConditionalGeneration(
SupportsQuant,
SupportsXDRoPE,
):
multimodal_cpu_fields = {"image_grid_thw"}
# To ensure correct weight loading and mapping.
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
......
......@@ -592,8 +592,6 @@ class HCXVisionCAbstractor(nn.Module):
dummy_inputs=HCXVisionDummyInputsBuilder,
)
class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"],
......
......@@ -576,8 +576,6 @@ class Idefics3Model(nn.Module):
dummy_inputs=Idefics3DummyInputsBuilder,
)
class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
......
......@@ -78,15 +78,15 @@ class SupportsMultiModal(Protocol):
`multimodal_config.mm_encoder_tp_mode="data"`.
"""
merge_by_field_config: ClassVar[bool] = False
merge_by_field_config: ClassVar[bool | None] = None
"""
A flag that indicates which implementation of
[DEPRECATED] A flag that indicates which implementation of
`vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
"""
multimodal_cpu_fields: ClassVar[Set[str]] = frozenset()
multimodal_cpu_fields: ClassVar[Set[str] | None] = None
"""
A set indicating CPU-only multimodal fields.
[DEPRECATED] A set indicating CPU-only multimodal fields.
"""
_processor_factory: ClassVar[_ProcessorFactories]
......@@ -260,7 +260,35 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: ...
def supports_multimodal(
model: type[object] | object,
) -> TypeIs[type[SupportsMultiModal]] | TypeIs[SupportsMultiModal]:
return getattr(model, "supports_multimodal", False)
res = getattr(model, "supports_multimodal", False)
if res:
# We can remove this starting from v0.14
merge_by_field_config = getattr(model, "merge_by_field_config", None)
if merge_by_field_config is False:
raise ValueError(
"`merge_by_field_config=False` is no longer effective, "
"please update your model to consider the new batching logic "
"in `group_mm_kwargs_by_modality` (refer to "
"https://github.com/vllm-project/vllm/issues/26149), "
"and then remove the override from your model."
)
if merge_by_field_config is True:
logger.warning_once(
"`merge_by_field_config=True` is redundant, "
"please remove the override from your model."
)
multimodal_cpu_fields = getattr(model, "multimodal_cpu_fields", None)
if multimodal_cpu_fields is not None:
raise ValueError(
"`multimodal_cpu_fields` is no longer effective, "
"please set `keep_on_cpu=True` in `MultiModalFieldConfig` "
"(refer to https://github.com/vllm-project/vllm/pull/30181), "
"and then remove the override from your model."
)
return res
def supports_multimodal_raw_input_only(model: type[object] | object) -> bool:
......
......@@ -509,8 +509,6 @@ class InternS1MultiModalProcessor(BaseMultiModalProcessor[InternS1ProcessingInfo
class InternS1ForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
):
merge_by_field_config = True
# To ensure correct weight loading and mapping.
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
......
......@@ -1074,8 +1074,6 @@ class InternVLMultiModalProcessor(
dummy_inputs=InternVLDummyInputsBuilder,
)
class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
merge_by_field_config = True
supports_encoder_tp_data = True
@classmethod
......
......@@ -1000,7 +1000,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
def _parse_image_data(
self,
data: dict[str, torch.Tensor] | ModalityData[ImageItem],
) -> ModalityDataItems[Any, Any]:
) -> ModalityDataItems[Any, Any] | None:
if isinstance(data, dict):
return DictEmbeddingItems(
data,
......@@ -1017,7 +1017,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
def _parse_video_data(
self,
data: dict[str, torch.Tensor] | ModalityData[VideoItem],
) -> ModalityDataItems[Any, Any]:
) -> ModalityDataItems[Any, Any] | None:
if isinstance(data, dict):
return DictEmbeddingItems(
data,
......@@ -1292,8 +1292,6 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
class BaseKeyeModule(nn.Module):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
......
......@@ -333,7 +333,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
def _parse_image_data(
self,
data: dict[str, torch.Tensor] | ModalityData[ImageItem],
) -> ModalityDataItems[Any, Any]:
) -> ModalityDataItems[Any, Any] | None:
if isinstance(data, dict):
return DictEmbeddingItems(
data,
......@@ -350,7 +350,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
def _parse_video_data(
self,
data: dict[str, torch.Tensor] | ModalityData[VideoItem],
) -> ModalityDataItems[Any, Any]:
) -> ModalityDataItems[Any, Any] | None:
if isinstance(data, dict):
return DictEmbeddingItems(
data,
......
......@@ -298,8 +298,6 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
dummy_inputs=KimiVLDummyInputsBuilder,
)
class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
supports_encoder_tp_data = True
@classmethod
......
......@@ -28,7 +28,7 @@ from vllm.model_executor.models.utils import (
)
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import BaseMultiModalProcessorCache
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
from vllm.multimodal.processing import (
BaseMultiModalProcessor,
......@@ -103,7 +103,7 @@ class LightOnOCRMultiModalProcessor(BaseMultiModalProcessor[Mistral3ProcessingIn
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index
......
......@@ -149,8 +149,6 @@ class LlamaAttention(nn.Module):
if head_dim is None:
head_dim = self.hidden_size // self.total_num_heads
self.head_dim = head_dim
# Phi models introduced a partial_rotary_factor parameter in the config
self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
......@@ -265,7 +263,6 @@ class LlamaAttention(nn.Module):
max_position=self.max_position_embeddings,
rope_parameters=getattr(config, "rope_parameters", None),
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)
......
......@@ -28,7 +28,10 @@ from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.quantization.torchao import TorchAOConfig
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead,
VocabParallelEmbedding,
)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.llama4 import Llama4DecoderLayer, Llama4ForCausalLM
from vllm.model_executor.models.utils import extract_layer_index
......@@ -182,6 +185,12 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM):
self.config.vocab_size, scale=logit_scale
)
self.lm_head = ParallelLMHead(
self.config.draft_vocab_size,
self.config.hidden_size,
prefix=maybe_prefix(prefix, "lm_head"),
)
# Set MoE hyperparameters
self.set_moe_parameters()
......@@ -211,6 +220,6 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM):
loader = AutoWeightsLoader(
self,
# lm_head is tied with target model (Llama4ForCausalLM)
skip_prefixes=(["lm_head."]),
skip_prefixes=([]),
)
loader.load_weights(map(transform, weights))
......@@ -506,8 +506,6 @@ def init_vision_tower_for_llava(
dummy_inputs=LlavaDummyInputsBuilder,
)
class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"],
......
......@@ -223,8 +223,6 @@ class LlavaNextMultiModalProcessor(
dummy_inputs=LlavaDummyInputsBuilder,
)
class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
# mapping for new names in checkpoint saved after transformers v4.52
......
......@@ -299,8 +299,6 @@ class LlavaNextMultiModalProjector(nn.Module):
dummy_inputs=LlavaNextVideoDummyInputsBuilder,
)
class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
# mapping for new names in checkpoint saved after transformers v4.52
......
......@@ -479,8 +479,6 @@ class LlavaOnevisionMultiModalProjector(nn.Module):
dummy_inputs=LlavaOnevisionDummyInputsBuilder,
)
class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
# mapping for new names in checkpoint saved after transformers v4.52
......
......@@ -683,8 +683,6 @@ class MiDashengLMMultiModalProcessor(
dummy_inputs=MiDashengLMDummyInputsBuilder,
)
class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment