[feat] Add detail in image_data (#8596)

873f384a · Yuhao Yao · GitHub · b01eeb80 · 873f384a · 873f384a
Unverified commit 873f384a, authored Aug 05, 2025 by Yuhao Yao; committed by GitHub on Aug 05, 2025.
5 changed files
--- a/python/sglang/srt/conversation.py
+++ b/python/sglang/srt/conversation.py
@@ -30,8 +30,10 @@ import re
 from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple, Union

+from typing_extensions import Literal
+
 from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
-from sglang.srt.utils import read_system_prompt_from_file
+from sglang.srt.utils import ImageData, read_system_prompt_from_file


 class SeparatorStyle(IntEnum):
@@ -91,7 +93,7 @@ class Conversation:
    video_token: str = "<video>"
    audio_token: str = "<audio>"

-    image_data: Optional[List[str]] = None
+    image_data: Optional[List[ImageData]] = None
    video_data: Optional[List[str]] = None
    modalities: Optional[List[str]] = None
    stop_token_ids: Optional[int] = None
@@ -381,9 +383,9 @@ class Conversation:
        """Append a new message."""
        self.messages.append([role, message])

-    def append_image(self, image: str):
+    def append_image(self, image: str, detail: Literal["auto", "low", "high"]):
        """Append a new image."""
-        self.image_data.append(image)
+        self.image_data.append(ImageData(url=image, detail=detail))

    def append_video(self, video: str):
        """Append a new video."""
@@ -627,7 +629,9 @@ def generate_chat_conv(
                            real_content = image_token + real_content
                        else:
                            real_content += image_token
-                        conv.append_image(content.image_url.url)
+                        conv.append_image(
+                            content.image_url.url, content.image_url.detail
+                        )
                    elif content.type == "video_url":
                        real_content += video_token
                        conv.append_video(content.video_url.url)

--- a/python/sglang/srt/jinja_template_utils.py
+++ b/python/sglang/srt/jinja_template_utils.py
@@ -9,6 +9,8 @@ import logging
 import jinja2
 import transformers.utils.chat_template_utils as hf_chat_utils

+from sglang.srt.utils import ImageData
+
 logger = logging.getLogger(__name__)

 # ============================================================================
@@ -140,7 +142,12 @@ def process_content_for_template_format(
                chunk_type = chunk.get("type")

                if chunk_type == "image_url":
-                    image_data.append(chunk["image_url"]["url"])
+                    image_data.append(
+                        ImageData(
+                            url=chunk["image_url"]["url"],
+                            detail=chunk["image_url"].get("detail", "auto"),
+                        )
+                    )
                    if chunk.get("modalities"):
                        modalities.append(chunk.get("modalities"))
                    # Normalize to simple 'image' type for template compatibility

--- a/python/sglang/srt/managers/io_struct.py
+++ b/python/sglang/srt/managers/io_struct.py
@@ -26,6 +26,7 @@ from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.multimodal.mm_utils import has_valid_data
 from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.utils import ImageData

 # Handle serialization of Image for pydantic
 if TYPE_CHECKING:
@@ -45,7 +46,7 @@ class SessionParams:

 # Type definitions for multimodal input data
 # Individual data item types for each modality
-ImageDataInputItem = Union[Image, str, Dict]
+ImageDataInputItem = Union[Image, str, ImageData, Dict]
 AudioDataInputItem = Union[str, Dict]
 VideoDataInputItem = Union[str, Dict]
 # Union type for any multimodal data item

--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -44,6 +44,7 @@ import traceback
 import warnings
 from collections import OrderedDict, defaultdict
 from contextlib import contextmanager
+from dataclasses import dataclass
 from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from importlib.util import find_spec
@@ -84,6 +85,7 @@ from torch.library import Library
 from torch.profiler import ProfilerActivity, profile, record_function
 from torch.utils._contextlib import _DecoratorContextManager
 from triton.runtime.cache import FileCacheManager
+from typing_extensions import Literal

 from sglang.srt.metrics.func_timer import enable_func_timer

@@ -736,9 +738,18 @@ def load_audio(
    return audio


+@dataclass
+class ImageData:
+    url: str
+    detail: Optional[Literal["auto", "low", "high"]] = "auto"
+
+
 def load_image(
-    image_file: Union[Image.Image, str, bytes],
+    image_file: Union[Image.Image, str, ImageData, bytes],
 ) -> tuple[Image.Image, tuple[int, int]]:
+    if isinstance(image_file, ImageData):
+        image_file = image_file.url
+
    image = image_size = None
    if isinstance(image_file, Image.Image):
        image = image_file
@@ -762,7 +773,7 @@ def load_image(
    elif isinstance(image_file, str):
        image = Image.open(BytesIO(pybase64.b64decode(image_file, validate=True)))
    else:
-        raise ValueError(f"Invalid image: {image}")
+        raise ValueError(f"Invalid image: {image_file}")

    return image, image_size


--- a/test/srt/test_jinja_template_utils.py
+++ b/test/srt/test_jinja_template_utils.py
@@ -85,7 +85,7 @@ class TestTemplateContentFormatDetection(CustomTestCase):

        # Check that image_data was extracted
        self.assertEqual(len(image_data), 1)
-        self.assertEqual(image_data[0], "http://example.com/image.jpg")
+        self.assertEqual(image_data[0].url, "http://example.com/image.jpg")

        # Check that content was normalized
        expected_content = [