Unverified Commit b024a42e authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Core] Move multimodal placeholder from chat utils to model definition (#20355)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent cb97f2bf
......@@ -1257,6 +1257,15 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
"model.visual.": "visual.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|begin_of_image|><|image|><|end_of_image|>"
if modality.startswith("video"):
return "<|begin_of_video|><|video|><|end_of_video|>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config: Glm4vConfig = vllm_config.model_config.hf_config
......
......@@ -540,6 +540,13 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
connector="transformer.vision.linear_proj",
tower_model="transformer.vision.transformer")
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|begin_of_image|><|endoftext|><|end_of_image|>"
raise ValueError("Only image modality is supported")
def __init__(
self,
*,
......
......@@ -533,6 +533,13 @@ class GraniteSpeechForConditionalGeneration(
],
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("audio"):
return "<|audio|>"
raise ValueError("Only audio modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str):
super().__init__()
config = vllm_config.model_config.hf_config
......
......@@ -591,6 +591,13 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
],
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
......
......@@ -46,6 +46,13 @@ class SupportsMultiModal(Protocol):
MRO of your model class.
"""
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
"""
Get the placeholder text for the `i`th `modality` item in the prompt.
"""
...
def get_multimodal_embeddings(self,
**kwargs: object) -> MultiModalEmbeddings:
"""
......
......@@ -1023,6 +1023,15 @@ class InternVLMultiModalProcessor(
class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
SupportsLoRA):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
if modality.startswith("video"):
return "<video>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()
......
......@@ -1343,6 +1343,15 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
"model.": "language_model.model.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|vision_start|><|image_pad|><|vision_end|>"
if modality.startswith("video"):
return "<|vision_start|><|video_pad|><|vision_end|>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config: PretrainedConfig = vllm_config.model_config.hf_config
......
......@@ -264,6 +264,13 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
dummy_inputs=KimiVLDummyInputsBuilder)
class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"
raise ValueError("Only image modality is supported")
def __init__(
self,
vllm_config: VllmConfig,
......
......@@ -511,6 +511,13 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()
......
......@@ -215,6 +215,13 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()
config = vllm_config.model_config.hf_config
......
......@@ -281,6 +281,15 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
if modality.startswith("video"):
return "<video>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()
config = vllm_config.model_config.hf_config
......
......@@ -446,6 +446,15 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
if modality.startswith("video"):
return "<video>"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()
config = vllm_config.model_config.hf_config
......
......@@ -511,6 +511,17 @@ class MiniCPMO(MiniCPMV2_6):
],
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "(<image>./</image>)"
if modality.startswith("video"):
return "(<video>./</video>)"
if modality.startswith("audio"):
return "(<audio>./</audio>)"
raise ValueError("Only image, video or audio modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__(vllm_config=vllm_config, prefix=prefix)
self.apm = self.init_audio_module(vllm_config=vllm_config,
......
......@@ -735,6 +735,15 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
instantiated.
"""
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "(<image>./</image>)"
if modality.startswith("video"):
return "(<video>./</video>)"
raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config
multimodal_config = vllm_config.model_config.multimodal_config
......
......@@ -158,6 +158,13 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal,
"gate_up_proj": ["gate_proj", "up_proj"]
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()
......
......@@ -401,6 +401,13 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
"lm_head.": "language_model.lm_head.",
})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return None
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__()
......
......@@ -1276,6 +1276,13 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
},
)
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|image|>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config: MllamaConfig = vllm_config.model_config.hf_config
......
......@@ -719,6 +719,13 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<|image|>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
......
......@@ -1366,6 +1366,13 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
"merged_linear": ["gate_proj", "up_proj"] # image_projector
}
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return None
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
......
......@@ -405,6 +405,13 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
dummy_inputs=OvisDummyInputsBuilder)
class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
if modality.startswith("image"):
return "<image>"
raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment