Unverified commit c46b932d, authored by Cyrus Leung, committed by GitHub
Browse files

[Chore] Deprecate `SupportsMultiModal.merge_by_field_config` (#30170)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 64763823
...@@ -499,8 +499,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): ...@@ -499,8 +499,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
model to perform tasks that involve both image and text inputs. model to perform tasks that involve both image and text inputs.
""" """
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={ orig_to_new_prefix={
# mapping for new names in checkpoint saved after transformers v4.52 # mapping for new names in checkpoint saved after transformers v4.52
......
...@@ -318,8 +318,6 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int: ...@@ -318,8 +318,6 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
dummy_inputs=AyaVisionDummyInputsBuilder, dummy_inputs=AyaVisionDummyInputsBuilder,
) )
class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={ orig_to_new_prefix={
# mapping for new names in checkpoint saved after transformers v4.52 # mapping for new names in checkpoint saved after transformers v4.52
......
...@@ -523,8 +523,6 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]): ...@@ -523,8 +523,6 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
class Blip2ForConditionalGeneration( class Blip2ForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
): ):
merge_by_field_config = True
@classmethod @classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None: def get_placeholder_str(cls, modality: str, i: int) -> str | None:
if modality.startswith("image"): if modality.startswith("image"):
......
...@@ -918,8 +918,6 @@ class ChameleonModel(nn.Module): ...@@ -918,8 +918,6 @@ class ChameleonModel(nn.Module):
class ChameleonForConditionalGeneration( class ChameleonForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
): ):
merge_by_field_config = True
packed_modules_mapping = { packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"], "qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"], "gate_up_proj": ["gate_proj", "up_proj"],
......
...@@ -784,7 +784,6 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant): ...@@ -784,7 +784,6 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
is_pooling_model = True is_pooling_model = True
packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]} packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
merge_by_field_config = True
@classmethod @classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None: def get_placeholder_str(cls, modality: str, i: int) -> str | None:
......
...@@ -331,8 +331,6 @@ class Cohere2VisionMultiModalProcessor( ...@@ -331,8 +331,6 @@ class Cohere2VisionMultiModalProcessor(
dummy_inputs=Cohere2VisionDummyInputsBuilder, dummy_inputs=Cohere2VisionDummyInputsBuilder,
) )
class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={ orig_to_new_prefix={
"model.vision_tower.": "vision_tower.", "model.vision_tower.": "vision_tower.",
......
...@@ -344,8 +344,6 @@ class DeepseekOCRMultiModalProcessor( ...@@ -344,8 +344,6 @@ class DeepseekOCRMultiModalProcessor(
dummy_inputs=DeepseekOCRDummyInputsBuilder, dummy_inputs=DeepseekOCRDummyInputsBuilder,
) )
class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={ orig_to_new_prefix={
# map prefix for language backbone # map prefix for language backbone
......
...@@ -344,8 +344,6 @@ class DeepseekVL2MultiModalProcessor( ...@@ -344,8 +344,6 @@ class DeepseekVL2MultiModalProcessor(
dummy_inputs=DeepseekVL2DummyInputsBuilder, dummy_inputs=DeepseekVL2DummyInputsBuilder,
) )
class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={ orig_to_new_prefix={
"language.": "language_model.", "language.": "language_model.",
......
...@@ -690,8 +690,6 @@ class DotsVisionTransformer(nn.Module): ...@@ -690,8 +690,6 @@ class DotsVisionTransformer(nn.Module):
dummy_inputs=DotsOCRDummyInputsBuilder, dummy_inputs=DotsOCRDummyInputsBuilder,
) )
class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_substr={ orig_to_new_substr={
".attn.qkv_proj.": ".attn.qkv.", ".attn.qkv_proj.": ".attn.qkv.",
......
...@@ -1254,8 +1254,6 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing ...@@ -1254,8 +1254,6 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing
class Ernie4_5_VLMoeForConditionalGeneration( class Ernie4_5_VLMoeForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
): ):
merge_by_field_config = True
packed_modules_mapping = { packed_modules_mapping = {
"qkv_proj": [ "qkv_proj": [
"q_proj", "q_proj",
......
...@@ -260,8 +260,6 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]): ...@@ -260,8 +260,6 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
dummy_inputs=FuyuDummyInputsBuilder, dummy_inputs=FuyuDummyInputsBuilder,
) )
class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={ orig_to_new_prefix={
"model.vision_embed_tokens.": "vision_embed_tokens.", "model.vision_embed_tokens.": "vision_embed_tokens.",
......
...@@ -483,8 +483,6 @@ class Gemma3MultiModalProjector(nn.Module): ...@@ -483,8 +483,6 @@ class Gemma3MultiModalProjector(nn.Module):
class Gemma3ForConditionalGeneration( class Gemma3ForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
): ):
merge_by_field_config = True
packed_modules_mapping = { packed_modules_mapping = {
"qkv_proj": [ "qkv_proj": [
"q_proj", "q_proj",
......
...@@ -463,7 +463,6 @@ class Gemma3nMultimodalEmbedder(nn.Module): ...@@ -463,7 +463,6 @@ class Gemma3nMultimodalEmbedder(nn.Module):
class Gemma3nForConditionalGeneration( class Gemma3nForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsTranscription nn.Module, SupportsMultiModal, SupportsTranscription
): ):
merge_by_field_config = True
supported_languages = ISO639_1_SUPPORTED_LANGS supported_languages = ISO639_1_SUPPORTED_LANGS
packed_modules_mapping = { packed_modules_mapping = {
......
...@@ -1424,8 +1424,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): ...@@ -1424,8 +1424,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
class Glm4vForConditionalGeneration( class Glm4vForConditionalGeneration(
nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
): ):
merge_by_field_config = True
packed_modules_mapping = { packed_modules_mapping = {
"qkv_proj": [ "qkv_proj": [
"q_proj", "q_proj",
......
...@@ -561,8 +561,6 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): ...@@ -561,8 +561,6 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
class GLM4VForCausalLM( class GLM4VForCausalLM(
ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
): ):
merge_by_field_config = True
packed_modules_mapping = { packed_modules_mapping = {
"query_key_value": ["query_key_value"], "query_key_value": ["query_key_value"],
"dense_h_to_4h": ["dense_h_to_4h"], "dense_h_to_4h": ["dense_h_to_4h"],
......
...@@ -564,7 +564,6 @@ class GraniteSpeechForConditionalGeneration( ...@@ -564,7 +564,6 @@ class GraniteSpeechForConditionalGeneration(
SupportsLoRA, SupportsLoRA,
SupportsTranscription, SupportsTranscription,
): ):
merge_by_field_config = True
supported_languages = ISO639_1_SUPPORTED_LANGS supported_languages = ISO639_1_SUPPORTED_LANGS
packed_modules_mapping = { packed_modules_mapping = {
......
...@@ -786,7 +786,6 @@ class HunYuanVLForConditionalGeneration( ...@@ -786,7 +786,6 @@ class HunYuanVLForConditionalGeneration(
SupportsQuant, SupportsQuant,
SupportsXDRoPE, SupportsXDRoPE,
): ):
merge_by_field_config = True
multimodal_cpu_fields = {"image_grid_thw"} multimodal_cpu_fields = {"image_grid_thw"}
# To ensure correct weight loading and mapping. # To ensure correct weight loading and mapping.
......
...@@ -592,8 +592,6 @@ class HCXVisionCAbstractor(nn.Module): ...@@ -592,8 +592,6 @@ class HCXVisionCAbstractor(nn.Module):
dummy_inputs=HCXVisionDummyInputsBuilder, dummy_inputs=HCXVisionDummyInputsBuilder,
) )
class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
merge_by_field_config = True
packed_modules_mapping = { packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"], "qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"], "gate_up_proj": ["gate_proj", "up_proj"],
......
...@@ -576,8 +576,6 @@ class Idefics3Model(nn.Module): ...@@ -576,8 +576,6 @@ class Idefics3Model(nn.Module):
dummy_inputs=Idefics3DummyInputsBuilder, dummy_inputs=Idefics3DummyInputsBuilder,
) )
class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA): class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA):
merge_by_field_config = True
packed_modules_mapping = { packed_modules_mapping = {
"qkv_proj": [ "qkv_proj": [
"q_proj", "q_proj",
......
...@@ -78,9 +78,9 @@ class SupportsMultiModal(Protocol): ...@@ -78,9 +78,9 @@ class SupportsMultiModal(Protocol):
`multimodal_config.mm_encoder_tp_mode="data"`. `multimodal_config.mm_encoder_tp_mode="data"`.
""" """
merge_by_field_config: ClassVar[bool] = True merge_by_field_config: ClassVar[bool | None] = None
""" """
A flag that indicates which implementation of [DEPRECATED] A flag that indicates which implementation of
`vllm.multimodal.utils.group_mm_kwargs_by_modality` to use. `vllm.multimodal.utils.group_mm_kwargs_by_modality` to use.
""" """
...@@ -260,7 +260,26 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: ... ...@@ -260,7 +260,26 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: ...
def supports_multimodal( def supports_multimodal(
model: type[object] | object, model: type[object] | object,
) -> TypeIs[type[SupportsMultiModal]] | TypeIs[SupportsMultiModal]: ) -> TypeIs[type[SupportsMultiModal]] | TypeIs[SupportsMultiModal]:
return getattr(model, "supports_multimodal", False) res = getattr(model, "supports_multimodal", False)
if res:
# We can remove this starting from v0.14
merge_by_field_config = getattr(model, "merge_by_field_config", None)
if merge_by_field_config is False:
raise ValueError(
"`merge_by_field_config=False` is no longer effective, "
"please update your model to consider the new batching logic "
"in `group_mm_kwargs_by_modality` (refer to "
"https://github.com/vllm-project/vllm/issues/26149), "
"and then remove the override from your model."
)
if merge_by_field_config is True:
logger.warning_once(
"`merge_by_field_config=True` is redundant, "
"please remove the override from your model."
)
return res
def supports_multimodal_raw_input_only(model: type[object] | object) -> bool: def supports_multimodal_raw_input_only(model: type[object] | object) -> bool:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment