"examples/vscode:/vscode.git/clone" did not exist on "d054da1992175787f936d18aead51bef663a0399"
Unverified commit 0836be3b, authored by Hojin Yang, committed by GitHub
Browse files

[Model] Add HyperCLOVAX-SEED-Think-32B vision-language model support (#31471)


Signed-off-by: effortprogrammer <yhjhoward7@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
parent 4e95ec11
...@@ -701,6 +701,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen ...@@ -701,6 +701,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `GlmOcrForConditionalGeneration` | GLM-OCR | T + I<sup>E+</sup> | `zai-org/GLM-OCR`, etc. | ✅︎ | ✅︎ | | `GlmOcrForConditionalGeneration` | GLM-OCR | T + I<sup>E+</sup> | `zai-org/GLM-OCR`, etc. | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | | `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
| `HCXVisionV2ForCausalLM` | HyperCLOVAX-SEED-Think-32B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Think-32B` | | |
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ | | `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
......
...@@ -118,7 +118,7 @@ async def test_multi_chunk_streaming( ...@@ -118,7 +118,7 @@ async def test_multi_chunk_streaming(
# JIT compilation # JIT compilation
warmup_done = False warmup_done = False
while not warmup_done: while not warmup_done:
event = await receive_event(ws, timeout=360.0) event = await receive_event(ws, timeout=600.0)
if event["type"] in ("transcription.done", "error"): if event["type"] in ("transcription.done", "error"):
warmup_done = True warmup_done = True
......
...@@ -1458,6 +1458,38 @@ def test_parse_chat_messages_context_text_format( ...@@ -1458,6 +1458,38 @@ def test_parse_chat_messages_context_text_format(
assert mm_uuids is None assert mm_uuids is None
def test_parse_chat_messages_openai_format_image_url(
    phi3v_model_config,
    image_url,
):
    """Check OpenAI-format parsing of a user message with an image URL.

    The message carries one ``image_url`` part followed by one text part.
    With ``content_format="openai"`` the parsed conversation is expected to
    keep the content as a list of dicts, with the image part reduced to
    ``{"type": "image"}`` and the text part left unchanged.
    """
    question = "What's in the image?"
    user_message = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": question},
        ],
    }

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [user_message],
        phi3v_model_config,
        content_format="openai",
    )

    expected_content = [
        {"type": "image"},
        {"type": "text", "text": question},
    ]
    assert conversation == [{"role": "user", "content": expected_content}]

    # Both helpers are defined elsewhere in the test module; they are called
    # here with a count of 1 and, for the UUIDs, a single ``None`` entry.
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
def test_parse_chat_messages_rejects_too_many_images_in_one_message( def test_parse_chat_messages_rejects_too_many_images_in_one_message(
phi3v_model_config, phi3v_model_config,
image_url, image_url,
......
...@@ -313,6 +313,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -313,6 +313,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"HunYuanMoEV1ForCausalLM": _HfExamplesInfo( "HunYuanMoEV1ForCausalLM": _HfExamplesInfo(
"tencent/Hunyuan-A13B-Instruct", trust_remote_code=True "tencent/Hunyuan-A13B-Instruct", trust_remote_code=True
), ),
"HyperCLOVAXForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
trust_remote_code=True,
),
"InternLMForCausalLM": _HfExamplesInfo( "InternLMForCausalLM": _HfExamplesInfo(
"internlm/internlm-chat-7b", trust_remote_code=True "internlm/internlm-chat-7b", trust_remote_code=True
), ),
...@@ -793,6 +797,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -793,6 +797,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
trust_remote_code=True, trust_remote_code=True,
), ),
"HCXVisionV2ForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
trust_remote_code=True,
),
"HunYuanVLForConditionalGeneration": _HfExamplesInfo( "HunYuanVLForConditionalGeneration": _HfExamplesInfo(
"tencent/HunyuanOCR", "tencent/HunyuanOCR",
hf_overrides={"num_experts": 0}, hf_overrides={"num_experts": 0},
......
...@@ -1428,6 +1428,8 @@ def _parse_chat_message_content_part( ...@@ -1428,6 +1428,8 @@ def _parse_chat_message_content_part(
with multimodal placeholders. with multimodal placeholders.
""" """
if isinstance(part, str): # Handle plain text parts if isinstance(part, str): # Handle plain text parts
if wrap_dicts:
return {"type": "text", "text": part}
return part return part
# Handle structured dictionary parts # Handle structured dictionary parts
part_type, content = _parse_chat_message_content_mm_part(part) part_type, content = _parse_chat_message_content_mm_part(part)
...@@ -1487,11 +1489,9 @@ def _parse_chat_message_content_part( ...@@ -1487,11 +1489,9 @@ def _parse_chat_message_content_part(
else: else:
raise NotImplementedError(f"Unknown part type: {part_type}") raise NotImplementedError(f"Unknown part type: {part_type}")
return ( if wrap_dicts:
{"type": modality} return {"type": modality}
if wrap_dicts return MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None
else (MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None)
)
# No need to validate using Pydantic again # No need to validate using Pydantic again
......
...@@ -325,7 +325,7 @@ class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingIn ...@@ -325,7 +325,7 @@ class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingIn
hf_inputs: BatchFeature, hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]: ) -> Mapping[str, MultiModalFieldConfig]:
return dict( fields = dict(
pixel_values_images=MultiModalFieldConfig.batched("image"), pixel_values_images=MultiModalFieldConfig.batched("image"),
image_sizes_images=MultiModalFieldConfig.batched("image"), image_sizes_images=MultiModalFieldConfig.batched("image"),
vision_query_lengths_images=MultiModalFieldConfig.batched("image"), vision_query_lengths_images=MultiModalFieldConfig.batched("image"),
...@@ -333,6 +333,8 @@ class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingIn ...@@ -333,6 +333,8 @@ class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingIn
vision_query_lengths_videos=MultiModalFieldConfig.batched("video"), vision_query_lengths_videos=MultiModalFieldConfig.batched("video"),
) )
return fields
def _build_hcxvision_hf_info( def _build_hcxvision_hf_info(
ctx: InputProcessingContext, ctx: InputProcessingContext,
...@@ -590,12 +592,26 @@ class HCXVisionCAbstractor(nn.Module): ...@@ -590,12 +592,26 @@ class HCXVisionCAbstractor(nn.Module):
dummy_inputs=HCXVisionDummyInputsBuilder, dummy_inputs=HCXVisionDummyInputsBuilder,
) )
class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
"""
HyperCLOVAX-SEED Vision-Language Model (V1 architecture).
Supports:
- HyperCLOVAX-SEED-Vision-Instruct-3B
Uses CLIP/SigLIP as the vision encoder with C-Abstractor projector.
"""
packed_modules_mapping = { packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"], "qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"], "gate_up_proj": ["gate_proj", "up_proj"],
} }
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: def __init__(
self,
*,
vllm_config: VllmConfig,
prefix: str = "",
) -> None:
super().__init__() super().__init__()
# init configs # init configs
...@@ -647,8 +663,9 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -647,8 +663,9 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self.vision_config = vision_config self.vision_config = vision_config
self.text_config = text_config self.text_config = text_config
# use_sum_loss = bool(kwargs.pop("use_sum_loss", False)) self.make_empty_intermediate_tensors = (
# self.reduction = self._init_reduction_type(use_sum_loss) self.language_model.make_empty_intermediate_tensors
)
@classmethod @classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None: def get_placeholder_str(cls, modality: str, i: int) -> str | None:
......
This diff is collapsed.
...@@ -132,6 +132,8 @@ _TEXT_GENERATION_MODELS = { ...@@ -132,6 +132,8 @@ _TEXT_GENERATION_MODELS = {
"HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"), "HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
"HunYuanDenseV1ForCausalLM": ("hunyuan_v1", "HunYuanDenseV1ForCausalLM"), "HunYuanDenseV1ForCausalLM": ("hunyuan_v1", "HunYuanDenseV1ForCausalLM"),
"HCXVisionForCausalLM": ("hyperclovax_vision", "HCXVisionForCausalLM"), "HCXVisionForCausalLM": ("hyperclovax_vision", "HCXVisionForCausalLM"),
"HCXVisionV2ForCausalLM": ("hyperclovax_vision_v2", "HCXVisionV2ForCausalLM"),
"HyperCLOVAXForCausalLM": ("llama", "LlamaForCausalLM"),
"InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
"InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
"InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"), "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment