"src/vscode:/vscode.git/clone" did not exist on "da843b3d53f551f3b2d5b7b37b6e19789c4a6bd8"
Unverified commit 92009bd2 authored by Zaili Wang, committed by GitHub

fix: fix MMMU loading issue (#11759)


Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent 4ef981e2
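
This patch threads the benchmark's `backend` argument through the multimodal dataset builders (`sample_mmmu_requests`, `sample_image_requests`, and `create_mm_data_row`). With the backend known, `create_mm_data_row` can return the raw question text for OpenAI-style backends, which presumably apply the chat template server-side, and the locally templated prompt string for everything else; it also special-cases the Phi-4-multimodal processor, which expects its own image placeholder token rather than the generic content-item list.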
@@ -764,6 +764,7 @@ def get_dataset(args, tokenizer, model_id=None):
             image_content=args.image_content,
             image_format=args.image_format,
             image_resolution=args.image_resolution,
+            backend=args.backend,
         )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
@@ -781,6 +782,7 @@ def get_dataset(args, tokenizer, model_id=None):
         input_requests = sample_mmmu_requests(
             num_requests=args.num_prompts,
             processor=processor,
+            backend=args.backend,
             fixed_output_len=args.random_output_len,
             random_sample=True,
         )
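
For reference, a minimal argparse stub (ours, not the benchmark's actual CLI, which defines many more flags) showing how the `args.backend` value used above would be populated:

```python
import argparse

# Hypothetical stand-in for the benchmark's argument parser; only the flags
# referenced in the hunks above are reproduced here.
parser = argparse.ArgumentParser()
parser.add_argument("--backend", type=str, default="sglang")
parser.add_argument("--num-prompts", type=int, default=8)
parser.add_argument("--random-output-len", type=int, default=256)

args = parser.parse_args(["--backend", "sglang-oai-chat"])
print(args.backend)  # -> "sglang-oai-chat", forwarded as backend=args.backend
```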
@@ -1009,6 +1011,7 @@ async def get_mooncake_request_over_time(
 def sample_mmmu_requests(
     num_requests: int,
     processor: AutoProcessor | AutoTokenizer,
+    backend: str,
     fixed_output_len: Optional[int] = None,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
@@ -1081,7 +1084,7 @@ def sample_mmmu_requests(
         text_prompt = f"Question: {question}\n\nAnswer: "
         output_len = fixed_output_len if fixed_output_len is not None else 256
         data_row = create_mm_data_row(
-            text_prompt, [image], [image_data], output_len, processor
+            text_prompt, [image], [image_data], output_len, processor, backend
         )
         filtered_dataset.append(data_row)
@@ -1316,13 +1319,19 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     )


-def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
+def create_mm_data_row(
+    text_prompt, images: list, images_base64, output_len, processor, backend
+):
     try:
-        content_items = [
-            {"type": "image", "image": {"url": image_base64}}
-            for image_base64 in images_base64
-        ]
-        content_items.append({"type": "text", "text": text_prompt})
+        if type(processor).__name__ == "Phi4MMProcessor":
+            # <|endoftext10|> is the image token used in the phi-4-multimodal model.
+            content_items = text_prompt.replace("image 1", "|endoftext10|")
+        else:
+            content_items = [
+                {"type": "image", "image": {"url": image_base64}}
+                for image_base64 in images_base64
+            ]
+            content_items.append({"type": "text", "text": text_prompt})
         prompt_str = processor.apply_chat_template(
             [{"role": "user", "content": content_items}],
             add_generation_prompt=True,
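
A hedged sketch of what the non-Phi-4 branch hands to `apply_chat_template`; the model name is only an example, and any Hugging Face processor that ships a chat template should behave similarly:

```python
from transformers import AutoProcessor

# Example checkpoint (an assumption, not named in the patch).
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

content_items = [
    {"type": "image", "image": {"url": "data:image/jpeg;base64,..."}},
    {"type": "text", "text": "Question: What is shown in the image?\n\nAnswer: "},
]
prompt_str = processor.apply_chat_template(
    [{"role": "user", "content": content_items}],
    add_generation_prompt=True,
    tokenize=False,  # return the rendered string, not token ids
)
print(prompt_str)  # contains the model's own image placeholder tokens
```

The Phi-4 branch instead passes a plain string with the literal image token embedded, apparently because that processor's template does not consume the generic content-item list.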
@@ -1362,8 +1371,16 @@ def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
     # Vision tokens = total tokens - text tokens
     vision_prompt_len = prompt_len - text_prompt_len

+    use_raw_prompt = backend in [
+        "sglang-oai",
+        "sglang-oai-chat",
+        "vllm",
+        "vllm-chat",
+        "lmdeploy",
+        "lmdeploy-chat",
+    ]
     return DatasetRow(
-        prompt=text_prompt,
+        prompt=text_prompt if use_raw_prompt else prompt_str,
         prompt_len=prompt_len,
         output_len=output_len,
         text_prompt_len=text_prompt_len,
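
Isolated as a pure helper, the new selection rule behaves like the sketch below; the helper name and the server-side-templating rationale are our reading, not part of the patch:

```python
# Backends whose endpoints are expected to apply the chat template themselves,
# so the benchmark sends them the raw question text.
RAW_PROMPT_BACKENDS = {
    "sglang-oai",
    "sglang-oai-chat",
    "vllm",
    "vllm-chat",
    "lmdeploy",
    "lmdeploy-chat",
}

def choose_prompt(backend: str, text_prompt: str, prompt_str: str) -> str:
    # Raw text for OpenAI-style endpoints; pre-rendered template otherwise.
    return text_prompt if backend in RAW_PROMPT_BACKENDS else prompt_str

assert choose_prompt("vllm-chat", "Q?", "<templated>") == "Q?"
assert choose_prompt("sglang", "Q?", "<templated>") == "<templated>"
```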
@@ -1382,6 +1399,7 @@ def sample_image_requests(
     image_content: str,
     image_format: str,
     image_resolution: str,
+    backend: str,
 ) -> List[DatasetRow]:
     """Generate requests with images.
@@ -1447,6 +1465,7 @@ def sample_image_requests(
             list(images_base64),
             int(output_lens[i]),
             processor,
+            backend,
         )
         dataset.append(data_row)