Unverified commit b024a42e, authored by Cyrus Leung, committed by GitHub
Browse files

[Core] Move multimodal placeholder from chat utils to model definition (#20355)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent cb97f2bf
...@@ -240,6 +240,13 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -240,6 +240,13 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
"lm_head.": "language_model.lm_head.", "lm_head.": "language_model.lm_head.",
}) })
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; only names starting with ``"image"``
            are accepted.
        i: Item index. Unused by this model.

    Returns:
        Always ``None`` for image modalities.

    Raises:
        ValueError: If ``modality`` does not start with ``"image"``.
    """
    if not modality.startswith("image"):
        raise ValueError("Only image modality is supported")

    return None
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
......
...@@ -520,6 +520,13 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, ...@@ -520,6 +520,13 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
"model.": "language_model.model.", "model.": "language_model.model.",
}) })
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; only names starting with ``"image"``
            are accepted.
        i: Item index, embedded verbatim in the placeholder.

    Returns:
        ``"<|image_{i}|>"`` for image modalities.

    Raises:
        ValueError: If ``modality`` does not start with ``"image"``.
    """
    if not modality.startswith("image"):
        raise ValueError("Only image modality is supported")

    return f"<|image_{i}|>"
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
......
...@@ -902,6 +902,15 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): ...@@ -902,6 +902,15 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
}, },
) )
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; must start with ``"image"`` or
            ``"audio"``.
        i: Item index, embedded verbatim in the placeholder.

    Returns:
        ``"<|image_{i}|>"`` or ``"<|audio_{i}|>"`` depending on the
        modality prefix.

    Raises:
        ValueError: If ``modality`` starts with neither prefix.
    """
    # Prefixes are tried in order; the first match wins.
    candidates = (
        ("image", f"<|image_{i}|>"),
        ("audio", f"<|audio_{i}|>"),
    )
    for prefix, placeholder in candidates:
        if modality.startswith(prefix):
            return placeholder

    raise ValueError("Only image or audio modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
......
...@@ -327,6 +327,13 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo] ...@@ -327,6 +327,13 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
SupportsPP): SupportsPP):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; only names starting with ``"image"``
            are accepted.
        i: Item index. Unused by this model.

    Returns:
        Always ``None`` for image modalities.

    Raises:
        ValueError: If ``modality`` does not start with ``"image"``.
    """
    is_image = modality.startswith("image")
    if not is_image:
        raise ValueError("Only image modality is supported")

    return None
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
......
...@@ -118,6 +118,13 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal, ...@@ -118,6 +118,13 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal,
SupportsV0Only): SupportsV0Only):
""" Prithvi Masked Autoencoder""" """ Prithvi Masked Autoencoder"""
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; only names starting with ``"image"``
            are accepted.
        i: Item index. Unused by this model.

    Returns:
        Always ``None`` for image modalities.

    Raises:
        ValueError: If ``modality`` does not start with ``"image"``.
    """
    if not modality.startswith("image"):
        raise ValueError("Only image modality is supported")

    return None
def _instantiate_model(self, config: dict) -> Optional[nn.Module]: def _instantiate_model(self, config: dict) -> Optional[nn.Module]:
# We might be able/need to support different tasks with this same model # We might be able/need to support different tasks with this same model
......
...@@ -717,6 +717,17 @@ class Qwen2_5OmniThinkerForConditionalGeneration( ...@@ -717,6 +717,17 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
"thinker.": "", "thinker.": "",
}) })
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; must start with ``"image"``,
            ``"video"`` or ``"audio"``.
        i: Item index. Only the audio placeholder embeds it.

    Returns:
        The vision-delimited ``<|IMAGE|>`` / ``<|VIDEO|>`` token
        sequence, or ``"Audio {i}: ..."`` for audio modalities.

    Raises:
        ValueError: If ``modality`` starts with none of the prefixes.
    """
    # Prefixes are tried in order; the first match wins.
    candidates = (
        ("image", "<|vision_start|><|IMAGE|><|vision_end|>"),
        ("video", "<|vision_start|><|VIDEO|><|vision_end|>"),
        ("audio", f"Audio {i}: <|audio_bos|><|AUDIO|><|audio_eos|>"),
    )
    for prefix, placeholder in candidates:
        if modality.startswith(prefix):
            return placeholder

    raise ValueError("Only image, video or audio modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
thinker_config: Qwen2_5OmniThinkerConfig = ( thinker_config: Qwen2_5OmniThinkerConfig = (
......
...@@ -835,6 +835,15 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -835,6 +835,15 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
"model.": "language_model.model.", "model.": "language_model.model.",
}) })
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; must start with ``"image"`` or
            ``"video"``.
        i: Item index. Unused by this model.

    Returns:
        The vision-delimited ``<|image_pad|>`` or ``<|video_pad|>``
        token sequence, depending on the modality prefix.

    Raises:
        ValueError: If ``modality`` starts with neither prefix.
    """
    # Prefixes are tried in order; the first match wins.
    pad_token_by_prefix = (
        ("image", "<|image_pad|>"),
        ("video", "<|video_pad|>"),
    )
    for prefix, pad_token in pad_token_by_prefix:
        if modality.startswith(prefix):
            return "<|vision_start|>" + pad_token + "<|vision_end|>"

    raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
......
...@@ -251,6 +251,13 @@ class Qwen2AudioMultiModalProcessor( ...@@ -251,6 +251,13 @@ class Qwen2AudioMultiModalProcessor(
class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
SupportsPP): SupportsPP):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; only names starting with ``"audio"``
            are accepted.
        i: Item index, embedded in the ``"Audio {i}: "`` label.

    Returns:
        ``"Audio {i}: <|audio_bos|><|AUDIO|><|audio_eos|>"``.

    Raises:
        ValueError: If ``modality`` does not start with ``"audio"``.
    """
    if not modality.startswith("audio"):
        raise ValueError("Only audio modality is supported")

    return f"Audio {i}: <|audio_bos|><|AUDIO|><|audio_eos|>"
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
......
...@@ -1096,6 +1096,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1096,6 +1096,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
"model.": "language_model.model.", "model.": "language_model.model.",
}) })
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; must start with ``"image"`` or
            ``"video"``.
        i: Item index. Unused by this model.

    Returns:
        The vision-delimited ``<|image_pad|>`` or ``<|video_pad|>``
        token sequence, depending on the modality prefix.

    Raises:
        ValueError: If ``modality`` starts with neither prefix.
    """
    # Prefixes are tried in order; the first match wins.
    candidates = (
        ("image", "<|vision_start|><|image_pad|><|vision_end|>"),
        ("video", "<|vision_start|><|video_pad|><|vision_end|>"),
    )
    for prefix, placeholder in candidates:
        if modality.startswith(prefix):
            return placeholder

    raise ValueError("Only image or video modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
config: Qwen2VLConfig = vllm_config.model_config.hf_config config: Qwen2VLConfig = vllm_config.model_config.hf_config
......
...@@ -675,6 +675,13 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA, ...@@ -675,6 +675,13 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
connector="transformer.visual.attn_pool", connector="transformer.visual.attn_pool",
tower_model="transformer.visual.transformer") tower_model="transformer.visual.transformer")
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; only names starting with ``"image"``
            are accepted.
        i: Item index, embedded in the ``"Picture {i}: "`` label.

    Returns:
        ``"Picture {i}: <img></img>"``.

    Raises:
        ValueError: If ``modality`` does not start with ``"image"``.
    """
    if not modality.startswith("image"):
        raise ValueError("Only image modality is supported")

    return f"Picture {i}: <img></img>"
def __init__( def __init__(
self, self,
*, *,
......
...@@ -648,6 +648,13 @@ class SkyworkR1VProcessingInfo(BaseSkyworkR1VProcessingInfo): ...@@ -648,6 +648,13 @@ class SkyworkR1VProcessingInfo(BaseSkyworkR1VProcessingInfo):
dummy_inputs=SkyworkR1VDummyInputsBuilder) dummy_inputs=SkyworkR1VDummyInputsBuilder)
class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP): class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; only names starting with ``"image"``
            are accepted.
        i: Item index. Unused by this model.

    Returns:
        The literal ``"<image>"`` for image modalities.

    Raises:
        ValueError: If ``modality`` does not start with ``"image"``.
    """
    if not modality.startswith("image"):
        raise ValueError("Only image modality is supported")

    return "<image>"
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__() super().__init__()
......
...@@ -393,6 +393,13 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -393,6 +393,13 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
"gate_up_proj": ["gate_proj", "up_proj"] "gate_up_proj": ["gate_proj", "up_proj"]
} }
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; only names starting with ``"image"``
            are accepted.
        i: Item index. Unused by this model.

    Returns:
        The literal ``"<image>"`` for image modalities.

    Raises:
        ValueError: If ``modality`` does not start with ``"image"``.
    """
    is_image = modality.startswith("image")
    if not is_image:
        raise ValueError("Only image modality is supported")

    return "<image>"
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
super().__init__() super().__init__()
config: TarsierHfConfig = vllm_config.model_config.hf_config config: TarsierHfConfig = vllm_config.model_config.hf_config
......
...@@ -407,6 +407,13 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): ...@@ -407,6 +407,13 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."}) orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."})
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; only names starting with ``"audio"``
            are accepted.
        i: Item index. Unused by this model.

    Returns:
        The literal ``"<|audio|>"`` for audio modalities.

    Raises:
        ValueError: If ``modality`` does not start with ``"audio"``.
    """
    if not modality.startswith("audio"):
        raise ValueError("Only audio modality is supported")

    return "<|audio|>"
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
......
...@@ -761,6 +761,35 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, ...@@ -761,6 +761,35 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
".fc2.": ".mlp.fc2." ".fc2.": ".mlp.fc2."
}) })
@classmethod
def validate_language(cls, language: str) -> bool:
    """Check that ``language`` is one of the known language keys.

    Args:
        language: Key looked up in ``ISO639_1_SUPPORTED_LANGS`` and then
            in ``ISO639_1_OTHER_LANGS``.

    Returns:
        ``True`` when the key is found in either mapping. A warning is
        logged when it is found only in ``ISO639_1_OTHER_LANGS``.

    Raises:
        ValueError: If the key is in neither mapping.
    """
    if language in ISO639_1_SUPPORTED_LANGS:
        return True

    if language in ISO639_1_OTHER_LANGS:
        logger.warning(
            "The selected language %s has limited accuracy with"
            " reported WER>=0.5. Results may be less accurate "
            "for this choice.", language)
        return True

    # NOTE(review): membership above is tested against the mapping keys,
    # while the message below lists the mapping values — confirm that
    # listing values rather than keys is intended.
    # The separators between the message parts were previously missing,
    # which ran the sentences and the two lists together.
    raise ValueError(
        f"Unsupported language: {language}. "
        "Language should be one of: "
        f"{list(ISO639_1_SUPPORTED_LANGS.values())} "
        f"or {list(ISO639_1_OTHER_LANGS.values())}")
@classmethod
def get_decoder_prompt(cls, language: str, task_type: str,
                       prompt: str) -> str:
    """Build the decoder prompt string.

    Args:
        language: Text placed inside the second ``<|...|>`` token.
        task_type: Text placed inside the third ``<|...|>`` token.
        prompt: Text appended after the special-token prefix.

    Returns:
        ``<|startoftranscript|><|{language}|><|{task_type}|>``
        ``<|notimestamps|>`` followed by ``prompt``.
    """
    token_names = ("startoftranscript", language, task_type,
                   "notimestamps")
    prefix = "".join(f"<|{name}|>" for name in token_names)
    return f"{prefix}{prompt}"
@classmethod
def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
    """Return the prompt placeholder for a multimodal item.

    Args:
        modality: Modality name; only names starting with ``"audio"``
            are accepted.
        i: Item index. Unused by this model.

    Returns:
        Always ``None`` for audio modalities.

    Raises:
        ValueError: If ``modality`` does not start with ``"audio"``.
    """
    if not modality.startswith("audio"):
        raise ValueError("Only audio modality is supported")

    return None
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
...@@ -840,28 +869,6 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, ...@@ -840,28 +869,6 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
weights = _create_fake_bias_for_k_proj(weights) weights = _create_fake_bias_for_k_proj(weights)
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
@classmethod
def validate_language(cls, language: str) -> bool:
    """Check that ``language`` is one of the known language keys.

    Args:
        language: Key looked up in ``ISO639_1_SUPPORTED_LANGS`` and then
            in ``ISO639_1_OTHER_LANGS``.

    Returns:
        ``True`` when the key is found in either mapping. A warning is
        logged when it is found only in ``ISO639_1_OTHER_LANGS``.

    Raises:
        ValueError: If the key is in neither mapping.
    """
    if language in ISO639_1_SUPPORTED_LANGS:
        return True

    if language in ISO639_1_OTHER_LANGS:
        logger.warning(
            "The selected language %s has limited accuracy with"
            " reported WER>=0.5. Results may be less accurate "
            "for this choice.", language)
        return True

    # NOTE(review): membership above is tested against the mapping keys,
    # while the message below lists the mapping values — confirm that
    # listing values rather than keys is intended.
    # The separators between the message parts were previously missing,
    # which ran the sentences and the two lists together.
    raise ValueError(
        f"Unsupported language: {language}. "
        "Language should be one of: "
        f"{list(ISO639_1_SUPPORTED_LANGS.values())} "
        f"or {list(ISO639_1_OTHER_LANGS.values())}")
@classmethod
def get_decoder_prompt(cls, language: str, task_type: str,
                       prompt: str) -> str:
    """Build the decoder prompt string.

    Args:
        language: Text placed inside the second ``<|...|>`` token.
        task_type: Text placed inside the third ``<|...|>`` token.
        prompt: Text appended after the special-token prefix.

    Returns:
        ``<|startoftranscript|><|{language}|><|{task_type}|>``
        ``<|notimestamps|>`` followed by ``prompt``.
    """
    leading_tokens = "".join(
        f"<|{name}|>"
        for name in ("startoftranscript", language, task_type,
                     "notimestamps"))
    return f"{leading_tokens}{prompt}"
def _create_fake_bias_for_k_proj( def _create_fake_bias_for_k_proj(
weights: Iterable[tuple[str, torch.Tensor]] weights: Iterable[tuple[str, torch.Tensor]]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment