Unverified commit 4bab50a6 authored by Xinyuan Tong, committed by GitHub

Fix llama4 vision (#7840)


Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
parent 2e7ab862
@@ -935,6 +935,19 @@ register_conv_template(
     )
 )
+
+register_conv_template(
+    Conversation(
+        name="llama_4_vision",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA4,
+        sep="",
+        stop_str="<|eot|>",
+        image_token="<|image|>",
+    )
+)
 
 
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
@@ -943,9 +956,11 @@ def match_internvl(model_path: str):
 
 
 @register_conv_template_matching_function
-def match_llama_3_vision(model_path: str):
+def match_llama_vision(model_path: str):
     if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE):
         return "llama_3_vision"
+    if re.search(r"llama.*4.*", model_path, re.IGNORECASE):
+        return "llama_4_vision"
 
 
 @register_conv_template_matching_function
...
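For reference, the matching change above can be exercised on its own with plain `re`; `pick_template` is a hypothetical stand-in for the registered matcher, and the model paths are only illustrative:

    import re

    def pick_template(model_path: str):
        # Mirrors match_llama_vision above: Llama 3.2 vision checkpoints keep the
        # existing llama_3_vision template, while anything that looks like Llama 4
        # gets the new llama_4_vision template.
        if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE):
            return "llama_3_vision"
        if re.search(r"llama.*4.*", model_path, re.IGNORECASE):
            return "llama_4_vision"
        return None

    print(pick_template("meta-llama/Llama-3.2-11B-Vision-Instruct"))   # llama_3_vision
    print(pick_template("meta-llama/Llama-4-Scout-17B-16E-Instruct"))  # llama_4_vision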
@@ -248,7 +248,9 @@ def _get_chunked_prefill_embedding(
 ) -> Optional[torch.Tensor]:
     # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
     embedding_list = []
-    for i in range(len(items_size) - 1):
+    # FIXME(Xinyuan): temporary workaround for eagle3, which may have len(items_size) > len(prefix_length)
+    max_iterations = min(len(items_size) - 1, len(prefix_length))
+    for i in range(max_iterations):
         if items_size[i] == items_size[i + 1]:
             continue
         embedding_items_per_req = embedding_items[items_size[i] : items_size[i + 1]]
@@ -269,7 +271,7 @@ def _get_chunked_prefill_embedding(
         embedding_per_req_chunk, _, end_index = get_embedding_chunk(
             embedding=embedding_per_req,
             extend_prefix_len=prefix_length[i],
-            extend_seq_len=extend_length[i],
+            extend_seq_len=extend_length[i] if i < len(extend_length) else 0,
             items_offset=items_offset,
         )
         # remove this item from cache if chunk reaches to the end
...
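The loop change above guards against the eagle3 case called out in the FIXME, where `items_size` can have more boundaries than `prefix_length` has entries. A minimal sketch with made-up lengths shows how clamping the iteration count and the `extend_length` lookup avoids an IndexError:

    # Hypothetical shapes: one more request boundary than per-request lengths.
    items_size = [0, 2, 2, 5, 7]   # item boundaries (5 entries -> 4 requests)
    prefix_length = [16, 0, 32]    # only 3 entries
    extend_length = [8, 4]         # only 2 entries

    max_iterations = min(len(items_size) - 1, len(prefix_length))  # 3 instead of 4
    for i in range(max_iterations):
        if items_size[i] == items_size[i + 1]:
            continue  # request i carries no multimodal items
        seq_len = extend_length[i] if i < len(extend_length) else 0
        print(i, prefix_length[i], seq_len)  # prints (0, 16, 8) and (2, 32, 0)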
@@ -60,7 +60,9 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         )
 
         # Handle image resolutions and aspect ratios
-        if "pixel_values" in processor_output:
+        if "pixel_values" not in processor_output:  # no image processed
+            return None
+
         image_processor = processor.image_processor
         tokenizer = self._processor.tokenizer
...
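The processor change is an early return for text-only requests, where the processor output contains no `pixel_values`. A standalone sketch of the same guard; `handle_output` and the dicts are made up for illustration:

    def handle_output(processor_output: dict):
        # Without "pixel_values" there is no image to handle, so bail out
        # before touching the image processor.
        if "pixel_values" not in processor_output:  # no image processed
            return None
        # ... image resolution / aspect-ratio handling would follow here ...
        return processor_output["pixel_values"]

    print(handle_output({"input_ids": [1, 2, 3]}))                         # None
    print(handle_output({"input_ids": [1, 2], "pixel_values": "pixels"}))  # pixels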