Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
873f384a
Unverified
Commit
873f384a
authored
Aug 05, 2025
by
Yuhao Yao
Committed by
GitHub
Aug 05, 2025
Browse files
[feat] Add detail in image_data (#8596)
parent
b01eeb80
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
33 additions
and
10 deletions
+33
-10
python/sglang/srt/conversation.py
python/sglang/srt/conversation.py
+9
-5
python/sglang/srt/jinja_template_utils.py
python/sglang/srt/jinja_template_utils.py
+8
-1
python/sglang/srt/managers/io_struct.py
python/sglang/srt/managers/io_struct.py
+2
-1
python/sglang/srt/utils.py
python/sglang/srt/utils.py
+13
-2
test/srt/test_jinja_template_utils.py
test/srt/test_jinja_template_utils.py
+1
-1
No files found.
python/sglang/srt/conversation.py
View file @
873f384a
...
...
@@ -30,8 +30,10 @@ import re
from
enum
import
IntEnum
,
auto
from
typing
import
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Union
from
typing_extensions
import
Literal
from
sglang.srt.entrypoints.openai.protocol
import
ChatCompletionRequest
from
sglang.srt.utils
import
read_system_prompt_from_file
from
sglang.srt.utils
import
ImageData
,
read_system_prompt_from_file
class
SeparatorStyle
(
IntEnum
):
...
...
@@ -91,7 +93,7 @@ class Conversation:
video_token
:
str
=
"<video>"
audio_token
:
str
=
"<audio>"
image_data
:
Optional
[
List
[
str
]]
=
None
image_data
:
Optional
[
List
[
ImageData
]]
=
None
video_data
:
Optional
[
List
[
str
]]
=
None
modalities
:
Optional
[
List
[
str
]]
=
None
stop_token_ids
:
Optional
[
int
]
=
None
...
...
@@ -381,9 +383,9 @@ class Conversation:
"""Append a new message."""
self
.
messages
.
append
([
role
,
message
])
def
append_image
(
self
,
image
:
str
):
def
append_image
(
self
,
image
:
str
,
detail
:
Literal
[
"auto"
,
"low"
,
"high"
]
):
"""Append a new image."""
self
.
image_data
.
append
(
image
)
self
.
image_data
.
append
(
ImageData
(
url
=
image
,
detail
=
detail
)
)
def
append_video
(
self
,
video
:
str
):
"""Append a new video."""
...
...
@@ -627,7 +629,9 @@ def generate_chat_conv(
real_content
=
image_token
+
real_content
else
:
real_content
+=
image_token
conv
.
append_image
(
content
.
image_url
.
url
)
conv
.
append_image
(
content
.
image_url
.
url
,
content
.
image_url
.
detail
)
elif
content
.
type
==
"video_url"
:
real_content
+=
video_token
conv
.
append_video
(
content
.
video_url
.
url
)
...
...
python/sglang/srt/jinja_template_utils.py
View file @
873f384a
...
...
@@ -9,6 +9,8 @@ import logging
import
jinja2
import
transformers.utils.chat_template_utils
as
hf_chat_utils
from
sglang.srt.utils
import
ImageData
logger
=
logging
.
getLogger
(
__name__
)
# ============================================================================
...
...
@@ -140,7 +142,12 @@ def process_content_for_template_format(
chunk_type
=
chunk
.
get
(
"type"
)
if
chunk_type
==
"image_url"
:
image_data
.
append
(
chunk
[
"image_url"
][
"url"
])
image_data
.
append
(
ImageData
(
url
=
chunk
[
"image_url"
][
"url"
],
detail
=
chunk
[
"image_url"
].
get
(
"detail"
,
"auto"
),
)
)
if
chunk
.
get
(
"modalities"
):
modalities
.
append
(
chunk
.
get
(
"modalities"
))
# Normalize to simple 'image' type for template compatibility
...
...
python/sglang/srt/managers/io_struct.py
View file @
873f384a
...
...
@@ -26,6 +26,7 @@ from sglang.srt.lora.lora_registry import LoRARef
from
sglang.srt.managers.schedule_batch
import
BaseFinishReason
from
sglang.srt.multimodal.mm_utils
import
has_valid_data
from
sglang.srt.sampling.sampling_params
import
SamplingParams
from
sglang.srt.utils
import
ImageData
# Handle serialization of Image for pydantic
if
TYPE_CHECKING
:
...
...
@@ -45,7 +46,7 @@ class SessionParams:
# Type definitions for multimodal input data
# Individual data item types for each modality
ImageDataInputItem
=
Union
[
Image
,
str
,
Dict
]
ImageDataInputItem
=
Union
[
Image
,
str
,
ImageData
,
Dict
]
AudioDataInputItem
=
Union
[
str
,
Dict
]
VideoDataInputItem
=
Union
[
str
,
Dict
]
# Union type for any multimodal data item
...
...
python/sglang/srt/utils.py
View file @
873f384a
...
...
@@ -44,6 +44,7 @@ import traceback
import
warnings
from
collections
import
OrderedDict
,
defaultdict
from
contextlib
import
contextmanager
from
dataclasses
import
dataclass
from
functools
import
lru_cache
from
importlib.metadata
import
PackageNotFoundError
,
version
from
importlib.util
import
find_spec
...
...
@@ -84,6 +85,7 @@ from torch.library import Library
from
torch.profiler
import
ProfilerActivity
,
profile
,
record_function
from
torch.utils._contextlib
import
_DecoratorContextManager
from
triton.runtime.cache
import
FileCacheManager
from
typing_extensions
import
Literal
from
sglang.srt.metrics.func_timer
import
enable_func_timer
...
...
@@ -736,9 +738,18 @@ def load_audio(
return
audio
@
dataclass
class
ImageData
:
url
:
str
detail
:
Optional
[
Literal
[
"auto"
,
"low"
,
"high"
]]
=
"auto"
def
load_image
(
image_file
:
Union
[
Image
.
Image
,
str
,
bytes
],
image_file
:
Union
[
Image
.
Image
,
str
,
ImageData
,
bytes
],
)
->
tuple
[
Image
.
Image
,
tuple
[
int
,
int
]]:
if
isinstance
(
image_file
,
ImageData
):
image_file
=
image_file
.
url
image
=
image_size
=
None
if
isinstance
(
image_file
,
Image
.
Image
):
image
=
image_file
...
...
@@ -762,7 +773,7 @@ def load_image(
elif
isinstance
(
image_file
,
str
):
image
=
Image
.
open
(
BytesIO
(
pybase64
.
b64decode
(
image_file
,
validate
=
True
)))
else
:
raise
ValueError
(
f
"Invalid image:
{
image
}
"
)
raise
ValueError
(
f
"Invalid image:
{
image
_file
}
"
)
return
image
,
image_size
...
...
test/srt/test_jinja_template_utils.py
View file @
873f384a
...
...
@@ -85,7 +85,7 @@ class TestTemplateContentFormatDetection(CustomTestCase):
# Check that image_data was extracted
self
.
assertEqual
(
len
(
image_data
),
1
)
self
.
assertEqual
(
image_data
[
0
],
"http://example.com/image.jpg"
)
self
.
assertEqual
(
image_data
[
0
]
.
url
,
"http://example.com/image.jpg"
)
# Check that content was normalized
expected_content
=
[
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment