[Model] Add HyperCLOVAX-SEED-Think-32B vision-language model support (#31471)

Signed-off-by: effortprogrammer <yhjhoward7@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

[Model] Add HyperCLOVAX-SEED-Think-32B vision-language model support (#31471)
Signed-off-by: effortprogrammer <yhjhoward7@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
0836be3b · Hojin Yang · GitHub · 4e95ec11 · 0836be3b · 0836be3b
Unverified commit 0836be3b, authored Mar 10, 2026 by Hojin Yang; committed by GitHub on Mar 10, 2026.
8 changed files
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -701,6 +701,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `GlmOcrForConditionalGeneration` | GLM-OCR | T + I<sup>E+</sup> | `zai-org/GLM-OCR`, etc. | ✅︎ | ✅︎ |
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
 | `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
+| `HCXVisionV2ForCausalLM` | HyperCLOVAX-SEED-Think-32B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Think-32B` | | |
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
 | `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |

--- a/tests/entrypoints/openai/test_realtime_validation.py
+++ b/tests/entrypoints/openai/test_realtime_validation.py
@@ -118,7 +118,7 @@ async def test_multi_chunk_streaming(
            # JIT compilation
            warmup_done = False
            while not warmup_done:
-                event = await receive_event(ws, timeout=360.0)
+                event = await receive_event(ws, timeout=600.0)
                if event["type"] in ("transcription.done", "error"):
                    warmup_done = True

--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -1458,6 +1458,38 @@ def test_parse_chat_messages_context_text_format(
    assert mm_uuids is None
+def test_parse_chat_messages_openai_format_image_url(
+    phi3v_model_config,
+    image_url,
+):
+    content = [
+        {"type": "image_url", "image_url": {"url": image_url}},
+        {"type": "text", "text": "What's in the image?"},
+    ]
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role": "user",
+                "content": content,
+            }
+        ],
+        phi3v_model_config,
+        content_format="openai",
+    )
+    assert conversation == [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": "What's in the image?"},
+            ],
+        }
+    ]
+    _assert_mm_data_is_image_input(mm_data, 1)
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
 def test_parse_chat_messages_rejects_too_many_images_in_one_message(
    phi3v_model_config,
    image_url,

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -313,6 +313,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "HunYuanMoEV1ForCausalLM": _HfExamplesInfo(
        "tencent/Hunyuan-A13B-Instruct", trust_remote_code=True
    ),
+    "HyperCLOVAXForCausalLM": _HfExamplesInfo(
+        "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
+        trust_remote_code=True,
+    ),
    "InternLMForCausalLM": _HfExamplesInfo(
        "internlm/internlm-chat-7b", trust_remote_code=True
    ),
@@ -793,6 +797,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
        trust_remote_code=True,
    ),
+    "HCXVisionV2ForCausalLM": _HfExamplesInfo(
+        "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B",
+        trust_remote_code=True,
+    ),
    "HunYuanVLForConditionalGeneration": _HfExamplesInfo(
        "tencent/HunyuanOCR",
        hf_overrides={"num_experts": 0},

--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1428,6 +1428,8 @@ def _parse_chat_message_content_part(
    with multimodal placeholders.
    """
    if isinstance(part, str):  # Handle plain text parts
+        if wrap_dicts:
+            return {"type": "text", "text": part}
        return part
    # Handle structured dictionary parts
    part_type, content = _parse_chat_message_content_mm_part(part)
@@ -1487,11 +1489,9 @@ def _parse_chat_message_content_part(
    else:
        raise NotImplementedError(f"Unknown part type: {part_type}")
-    return (
+    if wrap_dicts:
-        {"type": modality}
+        return {"type": modality}
-        if wrap_dicts
+    return MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None
-        else (MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None)
-    )
 # No need to validate using Pydantic again

--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -325,7 +325,7 @@ class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingIn
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
-        return dict(
+        fields = dict(
            pixel_values_images=MultiModalFieldConfig.batched("image"),
            image_sizes_images=MultiModalFieldConfig.batched("image"),
            vision_query_lengths_images=MultiModalFieldConfig.batched("image"),
@@ -333,6 +333,8 @@ class HCXVisionMultiModalProcessor(BaseMultiModalProcessor[HCXVisionProcessingIn
            vision_query_lengths_videos=MultiModalFieldConfig.batched("video"),
        )
+        return fields
 def _build_hcxvision_hf_info(
    ctx: InputProcessingContext,
@@ -590,12 +592,26 @@ class HCXVisionCAbstractor(nn.Module):
    dummy_inputs=HCXVisionDummyInputsBuilder,
 )
 class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
+    """
+    HyperCLOVAX-SEED Vision-Language Model (V1 architecture).
+    Supports:
+    - HyperCLOVAX-SEED-Vision-Instruct-3B
+    Uses CLIP/SigLIP as the vision encoder with C-Abstractor projector.
+    """
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+    ) -> None:
        super().__init__()
        # init configs
@@ -647,8 +663,9 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
        self.vision_config = vision_config
        self.text_config = text_config
-        # use_sum_loss = bool(kwargs.pop("use_sum_loss", False))
+        self.make_empty_intermediate_tensors = (
-        # self.reduction = self._init_reduction_type(use_sum_loss)
+            self.language_model.make_empty_intermediate_tensors
+        )
    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:

--- a/vllm/model_executor/models/hyperclovax_vision_v2.py
+++ b/vllm/model_executor/models/hyperclovax_vision_v2.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -132,6 +132,8 @@ _TEXT_GENERATION_MODELS = {
    "HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
    "HunYuanDenseV1ForCausalLM": ("hunyuan_v1", "HunYuanDenseV1ForCausalLM"),
    "HCXVisionForCausalLM": ("hyperclovax_vision", "HCXVisionForCausalLM"),
+    "HCXVisionV2ForCausalLM": ("hyperclovax_vision_v2", "HCXVisionV2ForCausalLM"),
+    "HyperCLOVAXForCausalLM": ("llama", "LlamaForCausalLM"),
    "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
    "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
    "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),