Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
4395c87a
Unverified
Commit
4395c87a
authored
Jul 17, 2025
by
Mick
Committed by
GitHub
Jul 16, 2025
Browse files
refactor: unify names of the feature field of MultimodalDataItem (#8075)
parent
c28ad199
Changes
33
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
20 additions
and
14 deletions
+20
-14
python/sglang/srt/models/vila.py
python/sglang/srt/models/vila.py
+1
-1
python/sglang/srt/multimodal/processors/base_processor.py
python/sglang/srt/multimodal/processors/base_processor.py
+7
-1
python/sglang/srt/multimodal/processors/clip.py
python/sglang/srt/multimodal/processors/clip.py
+1
-1
python/sglang/srt/multimodal/processors/deepseek_vl_v2.py
python/sglang/srt/multimodal/processors/deepseek_vl_v2.py
+1
-1
python/sglang/srt/multimodal/processors/internvl.py
python/sglang/srt/multimodal/processors/internvl.py
+1
-1
python/sglang/srt/multimodal/processors/janus_pro.py
python/sglang/srt/multimodal/processors/janus_pro.py
+1
-1
python/sglang/srt/multimodal/processors/llava.py
python/sglang/srt/multimodal/processors/llava.py
+1
-1
python/sglang/srt/multimodal/processors/minicpm.py
python/sglang/srt/multimodal/processors/minicpm.py
+2
-2
python/sglang/srt/multimodal/processors/mlama.py
python/sglang/srt/multimodal/processors/mlama.py
+1
-1
python/sglang/srt/multimodal/processors/mllama4.py
python/sglang/srt/multimodal/processors/mllama4.py
+1
-1
python/sglang/srt/multimodal/processors/phi4mm.py
python/sglang/srt/multimodal/processors/phi4mm.py
+1
-1
python/sglang/srt/multimodal/processors/pixtral.py
python/sglang/srt/multimodal/processors/pixtral.py
+1
-1
test/srt/test_vlm_accuracy.py
test/srt/test_vlm_accuracy.py
+1
-1
No files found.
python/sglang/srt/models/vila.py
View file @
4395c87a
...
...
@@ -237,7 +237,7 @@ class VILAForConditionalGeneration(nn.Module):
return
cast
(
LogitsProcessorOutput
,
output
)
def
get_image_feature
(
self
,
mm_input
:
List
[
MultimodalDataItem
])
->
Tensor
:
pixel_values
=
cast
(
Tensor
,
mm_input
[
0
].
pixel_values
)
pixel_values
=
cast
(
Tensor
,
mm_input
[
0
].
feature
)
##### BEGIN COPY modeling_vila.py #####
...
...
python/sglang/srt/multimodal/processors/base_processor.py
View file @
4395c87a
...
...
@@ -5,7 +5,6 @@ import multiprocessing as mp
import
os
import
re
from
abc
import
ABC
,
abstractmethod
from
functools
import
lru_cache
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Union
import
numpy
as
np
...
...
@@ -156,6 +155,10 @@ class BaseMultimodalProcessor(ABC):
# "precomputed_features" - handled specially as it can be any modality
}
# name of the feature field
# TODO: pass from processors
self
.
FEATURE_NAMES
=
[
"pixel_values"
,
"pixel_values_videos"
,
"audio_features"
]
def
process_mm_data
(
self
,
input_text
,
images
=
None
,
videos
=
None
,
audios
=
None
,
**
kwargs
):
...
...
@@ -524,6 +527,9 @@ class BaseMultimodalProcessor(ABC):
if
modality
not
in
items
:
items
[
modality
]
=
MultimodalDataItem
(
modality
=
modality
)
if
attr_name
in
self
.
FEATURE_NAMES
:
attr_name
=
"feature"
# Set attribute
setattr
(
items
[
modality
],
attr_name
,
value
)
...
...
python/sglang/srt/multimodal/processors/clip.py
View file @
4395c87a
...
...
@@ -26,7 +26,7 @@ class ClipImageProcessor(BaseMultimodalProcessor):
image_inputs
[
"input_ids"
]
=
image_inputs
[
"input_ids"
].
tolist
()[
0
]
image_inputs
[
"mm_items"
]
=
[
MultimodalDataItem
(
pixel_values
=
image_inputs
[
"pixel_values"
],
modality
=
Modality
.
IMAGE
feature
=
image_inputs
[
"pixel_values"
],
modality
=
Modality
.
IMAGE
)
]
...
...
python/sglang/srt/multimodal/processors/deepseek_vl_v2.py
View file @
4395c87a
...
...
@@ -68,7 +68,7 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
input_ids
=
input_ids
,
mm_token_id
=
self
.
_processor
.
image_token_id
)
item
=
MultimodalDataItem
(
pixel_values
=
res
[
"images"
],
feature
=
res
[
"images"
],
offsets
=
image_offsets
,
modality
=
Modality
.
IMAGE
,
image_emb_mask
=
images_seq_mask
,
...
...
python/sglang/srt/multimodal/processors/internvl.py
View file @
4395c87a
...
...
@@ -223,7 +223,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
)
items
=
[
MultimodalDataItem
(
pixel_values
=
pixel_values
,
feature
=
pixel_values
,
modality
=
Modality
.
IMAGE
,
offsets
=
image_offsets
,
)
...
...
python/sglang/srt/multimodal/processors/janus_pro.py
View file @
4395c87a
...
...
@@ -47,7 +47,7 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
return
{
"mm_items"
:
[
MultimodalDataItem
(
pixel_values
=
res
[
"pixel_values"
],
feature
=
res
[
"pixel_values"
],
image_emb_mask
=
res
[
"images_emb_mask"
],
offsets
=
image_offsets
,
modality
=
Modality
.
IMAGE
,
...
...
python/sglang/srt/multimodal/processors/llava.py
View file @
4395c87a
...
...
@@ -158,7 +158,7 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
return
{
"mm_items"
:
[
MultimodalDataItem
(
pixel_values
=
pixel_values
,
feature
=
pixel_values
,
image_sizes
=
image_sizes
,
modality
=
modality
,
)
...
...
python/sglang/srt/multimodal/processors/minicpm.py
View file @
4395c87a
...
...
@@ -114,7 +114,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
if
len
(
pixel_values
)
!=
0
:
item
=
MultimodalDataItem
(
pixel_values
=
pixel_values
,
feature
=
pixel_values
,
offsets
=
image_offsets
,
tgt_size
=
tgt_sizes_flat
,
modality
=
Modality
.
IMAGE
,
...
...
@@ -135,7 +135,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
else
:
audio_offsets
=
None
item
=
MultimodalDataItem
(
audio_
feature
s
=
[
res
[
"audio_features"
]],
feature
=
[
res
[
"audio_features"
]],
audio_feature_lens
=
res
[
"audio_feature_lens"
],
offsets
=
audio_offsets
,
modality
=
Modality
.
AUDIO
,
...
...
python/sglang/srt/multimodal/processors/mlama.py
View file @
4395c87a
...
...
@@ -24,7 +24,7 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
image_inputs
[
"input_ids"
]
=
image_inputs
[
"input_ids"
].
tolist
()[
0
]
image_inputs
[
"mm_items"
]
=
[
MultimodalDataItem
(
pixel_values
=
image_inputs
[
"pixel_values"
],
feature
=
image_inputs
[
"pixel_values"
],
aspect_ratio_id
=
image_inputs
[
"aspect_ratio_ids"
],
aspect_ratio_mask
=
image_inputs
[
"aspect_ratio_mask"
],
modality
=
Modality
.
IMAGE
,
...
...
python/sglang/srt/multimodal/processors/mllama4.py
View file @
4395c87a
...
...
@@ -142,7 +142,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
# Add metadata for image processing
processor_output
[
"mm_items"
]
=
[
MultimodalDataItem
(
pixel_values
=
processor_output
[
"pixel_values"
],
feature
=
processor_output
[
"pixel_values"
],
modality
=
Modality
.
IMAGE
,
offsets
=
image_offsets
,
)
...
...
python/sglang/srt/multimodal/processors/phi4mm.py
View file @
4395c87a
...
...
@@ -62,7 +62,7 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
items
=
[
MultimodalDataItem
(
pixel_values
=
res
[
"input_image_embeds"
],
feature
=
res
[
"input_image_embeds"
],
image_sizes
=
res
[
"image_sizes"
],
image_emb_mask
=
res
[
"image_attention_mask"
],
offsets
=
image_offsets
,
...
...
python/sglang/srt/multimodal/processors/pixtral.py
View file @
4395c87a
...
...
@@ -103,7 +103,7 @@ class PixtralProcessor(BaseMultimodalProcessor):
)
mm_items
=
[
MultimodalDataItem
(
pixel_values
=
processor_output
[
"pixel_values"
],
feature
=
processor_output
[
"pixel_values"
],
image_sizes
=
processor_output
[
"image_sizes"
],
modality
=
Modality
.
IMAGE
,
offsets
=
image_offsets
,
...
...
test/srt/test_vlm_accuracy.py
View file @
4395c87a
...
...
@@ -245,7 +245,7 @@ class TestMiniCPMVLogits(VisionLLMLogitsBase):
MultimodalInputs
(
mm_items
=
[
MultimodalDataItem
(
pixel_values
=
pixel_values_flat
,
feature
=
pixel_values_flat
,
offsets
=
image_offsets
,
tgt_size
=
tgt_sizes_flat
,
modality
=
Modality
.
IMAGE
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment