Unverified Commit c46b932d authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Chore] Deprecate `SupportsMultiModal.merge_by_field_config` (#30170)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent 64763823
......@@ -499,8 +499,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
model to perform tasks that involve both image and text inputs.
"""
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
# mapping for new names in checkpoint saved after transformers v4.52
......
......@@ -318,8 +318,6 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
dummy_inputs=AyaVisionDummyInputsBuilder,
)
class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
# mapping for new names in checkpoint saved after transformers v4.52
......
......@@ -523,8 +523,6 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
class Blip2ForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
):
merge_by_field_config = True
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
if modality.startswith("image"):
......
......@@ -918,8 +918,6 @@ class ChameleonModel(nn.Module):
class ChameleonForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"],
......
......@@ -784,7 +784,6 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
is_pooling_model = True
packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
merge_by_field_config = True
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None:
......
......@@ -331,8 +331,6 @@ class Cohere2VisionMultiModalProcessor(
dummy_inputs=Cohere2VisionDummyInputsBuilder,
)
class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
"model.vision_tower.": "vision_tower.",
......
......@@ -344,8 +344,6 @@ class DeepseekOCRMultiModalProcessor(
dummy_inputs=DeepseekOCRDummyInputsBuilder,
)
class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
# map prefix for language backbone
......
......@@ -344,8 +344,6 @@ class DeepseekVL2MultiModalProcessor(
dummy_inputs=DeepseekVL2DummyInputsBuilder,
)
class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
"language.": "language_model.",
......
......@@ -690,8 +690,6 @@ class DotsVisionTransformer(nn.Module):
dummy_inputs=DotsOCRDummyInputsBuilder,
)
class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_substr={
".attn.qkv_proj.": ".attn.qkv.",
......
......@@ -1254,8 +1254,6 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing
class Ernie4_5_VLMoeForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
......
......@@ -260,8 +260,6 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
dummy_inputs=FuyuDummyInputsBuilder,
)
class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={
"model.vision_embed_tokens.": "vision_embed_tokens.",
......
......@@ -483,8 +483,6 @@ class Gemma3MultiModalProjector(nn.Module):
class Gemma3ForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
......
......@@ -463,7 +463,6 @@ class Gemma3nMultimodalEmbedder(nn.Module):
class Gemma3nForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsTranscription
):
merge_by_field_config = True
supported_languages = ISO639_1_SUPPORTED_LANGS
packed_modules_mapping = {
......
......@@ -1424,8 +1424,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
class Glm4vForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
......
......@@ -561,8 +561,6 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
class GLM4VForCausalLM(
ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
):
merge_by_field_config = True
packed_modules_mapping = {
"query_key_value": ["query_key_value"],
"dense_h_to_4h": ["dense_h_to_4h"],
......
......@@ -564,7 +564,6 @@ class GraniteSpeechForConditionalGeneration(
SupportsLoRA,
SupportsTranscription,
):
merge_by_field_config = True
supported_languages = ISO639_1_SUPPORTED_LANGS
packed_modules_mapping = {
......
......@@ -786,7 +786,6 @@ class HunYuanVLForConditionalGeneration(
SupportsQuant,
SupportsXDRoPE,
):
merge_by_field_config = True
multimodal_cpu_fields = {"image_grid_thw"}
# To ensure correct weight loading and mapping.
......
......@@ -592,8 +592,6 @@ class HCXVisionCAbstractor(nn.Module):
dummy_inputs=HCXVisionDummyInputsBuilder,
)
class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"],
......
......@@ -576,8 +576,6 @@ class Idefics3Model(nn.Module):
dummy_inputs=Idefics3DummyInputsBuilder,
)
class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA):
merge_by_field_config = True
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
......
......@@ -78,9 +78,9 @@ class SupportsMultiModal(Protocol):
`multimodal_config.mm_encoder_tp_mode="data"`.
"""
merge_by_field_config: ClassVar[bool] = True
merge_by_field_config: ClassVar[bool | None] = None
"""
A flag that indicates which implementation of
[DEPRECATED] A flag that indicates which implementation of
`vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
"""
......@@ -260,7 +260,26 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: ...
def supports_multimodal(
model: type[object] | object,
) -> TypeIs[type[SupportsMultiModal]] | TypeIs[SupportsMultiModal]:
return getattr(model, "supports_multimodal", False)
res = getattr(model, "supports_multimodal", False)
if res:
# We can remove this starting from v0.14
merge_by_field_config = getattr(model, "merge_by_field_config", None)
if merge_by_field_config is False:
raise ValueError(
"`merge_by_field_config=False` is no longer effective, "
"please update your model to consider the new batching logic "
"in `group_mm_kwargs_by_modality` (refer to "
"https://github.com/vllm-project/vllm/issues/26149), "
"and then remove the override from your model."
)
if merge_by_field_config is True:
logger.warning_once(
"`merge_by_field_config=True` is redundant, "
"please remove the override from your model."
)
return res
def supports_multimodal_raw_input_only(model: type[object] | object) -> bool:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment