fix: InternS1 doesn't recognize images; update image token for InternVL processor (#9381)

Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>

fix: InternS1 doesn't recognize images; update image token for InternVL processor (#9381)
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
84719b52 · Xinyuan Tong · GitHub · e99729c9 · 84719b52 · 84719b52
Unverified Commit 84719b52 authored Aug 21, 2025 by Xinyuan Tong Committed by GitHub Aug 20, 2025
Showing with 9 additions and 17 deletions

python/sglang/srt/conversation.py python/sglang/srt/conversation.py +2 -15

python/sglang/srt/multimodal/processors/internvl.py python/sglang/srt/multimodal/processors/internvl.py +7 -2

No files found.
--- a/python/sglang/srt/conversation.py
+++ b/python/sglang/srt/conversation.py
@@ -625,7 +625,7 @@ def generate_chat_conv(
                        real_content += content.text
                    elif content.type == "image_url":
                        # NOTE: works for llava and intervl2_5
-                        if conv.name in ["internvl-2-5", "interns1"]:
+                        if conv.name in ["internvl-2-5"]:
                            real_content = image_token + real_content
                        else:
                            real_content += image_token
@@ -817,20 +817,7 @@ register_conv_template(
        sep_style=SeparatorStyle.MPT,
        sep="<|im_end|>\n",
        stop_str=["<|im_end|>", "<|action_end|>"],
-        image_token="<image>",
-    )
-)
-
-register_conv_template(
-    Conversation(
-        name="interns1",
-        system_template="<|im_start|>system\n{system_message}",
-        system_message="You are an AI assistant whose name is Intern-S1 (书生大模型).\n- Intern-S1 (书生大模型) is a vision-language model that is developed by Shanghai AI Laboratory (上海人工智能实验室).  It is designed to be helpful, honest, and harmless.\n- Intern-S1 (书生大模型) can understand and communicate fluently in the language chosen by the user such as English and 中文.\nYou are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.",
-        roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
-        sep_style=SeparatorStyle.MPT,
-        sep="<|im_end|>\n",
-        stop_str=["<|im_end|>", "<|action_end|>"],
-        image_token="<image>",
+        image_token="<IMG_CONTEXT>",
    )
 )


--- a/python/sglang/srt/multimodal/processors/internvl.py
+++ b/python/sglang/srt/multimodal/processors/internvl.py
@@ -44,7 +44,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
        self.img_start_token_id = tokenizer.convert_tokens_to_ids(self.IMG_START_TOKEN)
        self.img_end_token_id = tokenizer.convert_tokens_to_ids(self.IMG_END_TOKEN)
        self.mm_tokens = MultimodalSpecialTokens(
-            image_token="<image>",
+            image_token="<IMG_CONTEXT>",
            image_token_id=tokenizer.convert_tokens_to_ids(self.IMG_CONTEXT_TOKEN),
        ).build(_image_processor)

@@ -218,13 +218,18 @@ class InternVLImageProcessor(BaseMultimodalProcessor):

        pixel_values = torch.cat(pixel_values, dim=0)

+        original_placeholder = "<<<__IMG_CONTEXT_PLACEHOLDER__>>>"
+        input_text = input_text.replace(self.IMG_CONTEXT_TOKEN, original_placeholder)
+
        for idx, num_patches in enumerate(num_patches_list):
            image_tokens = (
                self.IMG_START_TOKEN
                + self.IMG_CONTEXT_TOKEN * self.num_image_token * num_patches
                + self.IMG_END_TOKEN
            )
-            input_text = input_text.replace("<image>", image_tokens, 1)
+            input_text = input_text.replace(original_placeholder, image_tokens, 1)
+
+        input_text = input_text.replace(original_placeholder, self.IMG_CONTEXT_TOKEN)

        input_ids = self.tokenizer(input_text, return_tensors="pt")[
            "input_ids"