Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
04eb88dc
Unverified
Commit
04eb88dc
authored
May 22, 2025
by
Chenheli Hua
Committed by
GitHub
May 23, 2025
Browse files
Re-submit: Fix: Proper RGBA -> RGB conversion for PIL images. (#18569)
Signed-off-by:
Chenheli Hua
<
huachenheli@outlook.com
>
parent
46791e1b
Changes
15
Show whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
89 additions
and
20 deletions
+89
-20
benchmarks/benchmark_dataset.py
benchmarks/benchmark_dataset.py
+2
-1
examples/offline_inference/qwen2_5_omni/only_thinker.py
examples/offline_inference/qwen2_5_omni/only_thinker.py
+3
-1
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+3
-2
tests/models/multimodal/generation/test_interleaved.py
tests/models/multimodal/generation/test_interleaved.py
+4
-2
tests/models/multimodal/generation/test_phi4mm.py
tests/models/multimodal/generation/test_phi4mm.py
+2
-2
tests/models/test_oot_registration.py
tests/models/test_oot_registration.py
+2
-1
tests/multimodal/assets/rgba.png
tests/multimodal/assets/rgba.png
+0
-0
tests/multimodal/test_image.py
tests/multimodal/test_image.py
+36
-0
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+2
-1
vllm/benchmarks/datasets.py
vllm/benchmarks/datasets.py
+2
-2
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+2
-1
vllm/model_executor/models/skyworkr1v.py
vllm/model_executor/models/skyworkr1v.py
+2
-1
vllm/multimodal/hasher.py
vllm/multimodal/hasher.py
+3
-1
vllm/multimodal/image.py
vllm/multimodal/image.py
+22
-3
vllm/transformers_utils/processors/ovis.py
vllm/transformers_utils/processors/ovis.py
+4
-2
No files found.
benchmarks/benchmark_dataset.py
View file @
04eb88dc
...
...
@@ -35,6 +35,7 @@ from transformers import PreTrainedTokenizerBase
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.utils
import
get_adapter_absolute_path
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
get_lora_tokenizer
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -257,7 +258,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
if
isinstance
(
image
,
dict
)
and
"bytes"
in
image
:
image
=
Image
.
open
(
BytesIO
(
image
[
"bytes"
]))
if
isinstance
(
image
,
Image
.
Image
):
image
=
image
.
convert
(
"RGB"
)
image
=
convert
_image_mode
(
image
,
"RGB"
)
with
io
.
BytesIO
()
as
image_data
:
image
.
save
(
image_data
,
format
=
"JPEG"
)
image_base64
=
base64
.
b64encode
(
image_data
.
getvalue
()).
decode
(
"utf-8"
)
...
...
examples/offline_inference/qwen2_5_omni/only_thinker.py
View file @
04eb88dc
...
...
@@ -11,6 +11,7 @@ from vllm import LLM, SamplingParams
from
vllm.assets.audio
import
AudioAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.utils
import
FlexibleArgumentParser
...
...
@@ -45,7 +46,8 @@ def get_mixed_modalities_query() -> QueryResult:
"audio"
:
AudioAsset
(
"mary_had_lamb"
).
audio_and_sample_rate
,
"image"
:
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
),
convert_image_mode
(
ImageAsset
(
"cherry_blossom"
).
pil_image
,
"RGB"
),
"video"
:
VideoAsset
(
name
=
"baby_reading"
,
num_frames
=
16
).
np_ndarrays
,
},
...
...
examples/offline_inference/vision_language.py
View file @
04eb88dc
...
...
@@ -19,6 +19,7 @@ from vllm import LLM, EngineArgs, SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.utils
import
FlexibleArgumentParser
...
...
@@ -1096,8 +1097,8 @@ def get_multi_modal_input(args):
"""
if
args
.
modality
==
"image"
:
# Input image and question
image
=
ImageAsset
(
"cherry_blossom"
)
\
.
pil_image
.
convert
(
"RGB"
)
image
=
convert_image_mode
(
ImageAsset
(
"cherry_blossom"
)
.
pil_image
,
"RGB"
)
img_questions
=
[
"What is the content of this image?"
,
"Describe the content of this image in detail."
,
...
...
tests/models/multimodal/generation/test_interleaved.py
View file @
04eb88dc
...
...
@@ -4,6 +4,7 @@ import pytest
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.multimodal.image
import
convert_image_mode
models
=
[
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
]
...
...
@@ -26,8 +27,9 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
give the same result.
"""
image_cherry
=
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
)
image_stop
=
ImageAsset
(
"stop_sign"
).
pil_image
.
convert
(
"RGB"
)
image_cherry
=
convert_image_mode
(
ImageAsset
(
"cherry_blossom"
).
pil_image
,
"RGB"
)
image_stop
=
convert_image_mode
(
ImageAsset
(
"stop_sign"
).
pil_image
,
"RGB"
)
images
=
[
image_cherry
,
image_stop
]
video
=
VideoAsset
(
name
=
"baby_reading"
,
num_frames
=
16
).
np_ndarrays
...
...
tests/models/multimodal/generation/test_phi4mm.py
View file @
04eb88dc
...
...
@@ -12,7 +12,7 @@ from transformers import AutoTokenizer
from
vllm.assets.image
import
ImageAsset
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.image
import
convert_image_mode
,
rescale_image_size
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
SampleLogprobs
...
...
@@ -267,7 +267,7 @@ def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
# use the example speech question so that the model outputs are reasonable
audio
=
librosa
.
load
(
speech_question
,
sr
=
None
)
image
=
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
)
image
=
convert_image_mode
(
ImageAsset
(
"cherry_blossom"
).
pil_image
,
"RGB"
)
inputs_vision_speech
=
[
(
...
...
tests/models/test_oot_registration.py
View file @
04eb88dc
...
...
@@ -4,6 +4,7 @@ import pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.multimodal.image
import
convert_image_mode
from
..utils
import
create_new_process_for_each_test
...
...
@@ -58,7 +59,7 @@ def test_oot_registration_embedding(
assert
all
(
v
==
0
for
v
in
output
.
outputs
.
embedding
)
image
=
ImageAsset
(
"cherry_blossom"
).
pil_image
.
convert
(
"RGB"
)
image
=
convert_image_mode
(
ImageAsset
(
"cherry_blossom"
).
pil_image
,
"RGB"
)
@
create_new_process_for_each_test
()
...
...
tests/multimodal/assets/rgba.png
0 → 100644
View file @
04eb88dc
219 KB
tests/multimodal/test_image.py
0 → 100644
View file @
04eb88dc
# SPDX-License-Identifier: Apache-2.0
from
pathlib
import
Path
import
numpy
as
np
from
PIL
import
Image
,
ImageChops
from
vllm.multimodal.image
import
convert_image_mode
ASSETS_DIR
=
Path
(
__file__
).
parent
/
"assets"
assert
ASSETS_DIR
.
exists
()
def
test_rgb_to_rgb
():
# Start with an RGB image.
original_image
=
Image
.
open
(
ASSETS_DIR
/
"image1.png"
).
convert
(
"RGB"
)
converted_image
=
convert_image_mode
(
original_image
,
"RGB"
)
# RGB to RGB should be a no-op.
diff
=
ImageChops
.
difference
(
original_image
,
converted_image
)
assert
diff
.
getbbox
()
is
None
def
test_rgba_to_rgb
():
original_image
=
Image
.
open
(
ASSETS_DIR
/
"rgba.png"
)
original_image_numpy
=
np
.
array
(
original_image
)
converted_image
=
convert_image_mode
(
original_image
,
"RGB"
)
converted_image_numpy
=
np
.
array
(
converted_image
)
for
i
in
range
(
original_image_numpy
.
shape
[
0
]):
for
j
in
range
(
original_image_numpy
.
shape
[
1
]):
# Verify that all transparent pixels are converted to white.
if
original_image_numpy
[
i
][
j
][
3
]
==
0
:
assert
converted_image_numpy
[
i
][
j
][
0
]
==
255
assert
converted_image_numpy
[
i
][
j
][
1
]
==
255
assert
converted_image_numpy
[
i
][
j
][
2
]
==
255
tests/multimodal/test_utils.py
View file @
04eb88dc
...
...
@@ -10,6 +10,7 @@ import numpy as np
import
pytest
from
PIL
import
Image
,
ImageChops
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
PlaceholderRange
from
vllm.multimodal.utils
import
(
MediaConnector
,
merge_and_sort_multimodal_metadata
)
...
...
@@ -53,7 +54,7 @@ def get_supported_suffixes() -> tuple[str, ...]:
def
_image_equals
(
a
:
Image
.
Image
,
b
:
Image
.
Image
)
->
bool
:
return
(
np
.
asarray
(
a
)
==
np
.
asarray
(
b
.
convert
(
a
.
mode
))).
all
()
return
(
np
.
asarray
(
a
)
==
np
.
asarray
(
convert
_image_mode
(
b
,
a
.
mode
))).
all
()
@
pytest
.
mark
.
asyncio
...
...
vllm/benchmarks/datasets.py
View file @
04eb88dc
...
...
@@ -13,7 +13,6 @@ generation. Supported dataset types include:
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
SampleRequest instances, similar to the approach used in ShareGPT.
"""
import
base64
import
io
import
json
...
...
@@ -33,6 +32,7 @@ from transformers import PreTrainedTokenizerBase
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.utils
import
get_adapter_absolute_path
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
get_lora_tokenizer
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -259,7 +259,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
if
isinstance
(
image
,
dict
)
and
'bytes'
in
image
:
image
=
Image
.
open
(
BytesIO
(
image
[
'bytes'
]))
if
isinstance
(
image
,
Image
.
Image
):
image
=
image
.
convert
(
"RGB"
)
image
=
convert
_image_mode
(
image
,
"RGB"
)
with
io
.
BytesIO
()
as
image_data
:
image
.
save
(
image_data
,
format
=
"JPEG"
)
image_base64
=
base64
.
b64encode
(
...
...
vllm/model_executor/models/internvl.py
View file @
04eb88dc
...
...
@@ -23,6 +23,7 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel,
InternVisionPatchModel
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.parse
import
(
ImageEmbeddingItems
,
ImageProcessorItems
,
...
...
@@ -77,7 +78,7 @@ InternVLImageInputs = Union[InternVLImagePixelInputs,
def
build_transform
(
input_size
:
int
):
MEAN
,
STD
=
IMAGENET_MEAN
,
IMAGENET_STD
return
T
.
Compose
([
T
.
Lambda
(
lambda
img
:
img
.
convert
(
'RGB'
)
if
img
.
mode
!=
'RGB'
else
img
),
T
.
Lambda
(
lambda
img
:
convert
_image_mode
(
img
,
'RGB'
)
),
T
.
Resize
((
input_size
,
input_size
),
interpolation
=
T
.
InterpolationMode
.
BICUBIC
),
T
.
ToTensor
(),
...
...
vllm/model_executor/models/skyworkr1v.py
View file @
04eb88dc
...
...
@@ -24,6 +24,7 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel,
InternVisionPatchModel
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
convert_image_mode
from
vllm.multimodal.inputs
import
(
MultiModalDataDict
,
MultiModalFieldConfig
,
MultiModalKwargs
,
NestedTensors
)
from
vllm.multimodal.parse
import
(
ImageEmbeddingItems
,
ImageProcessorItems
,
...
...
@@ -78,7 +79,7 @@ SkyworkR1VImageInputs = Union[SkyworkR1VImagePixelInputs,
def
build_transform
(
input_size
:
int
):
MEAN
,
STD
=
IMAGENET_MEAN
,
IMAGENET_STD
return
T
.
Compose
([
T
.
Lambda
(
lambda
img
:
img
.
convert
(
'RGB'
)
if
img
.
mode
!=
'RGB'
else
img
),
T
.
Lambda
(
lambda
img
:
convert
_image_mode
(
img
,
'RGB'
)
),
T
.
Resize
((
input_size
,
input_size
),
interpolation
=
T
.
InterpolationMode
.
BICUBIC
),
T
.
ToTensor
(),
...
...
vllm/multimodal/hasher.py
View file @
04eb88dc
...
...
@@ -10,6 +10,7 @@ from blake3 import blake3
from
PIL
import
Image
from
vllm.logger
import
init_logger
from
vllm.multimodal.image
import
convert_image_mode
if
TYPE_CHECKING
:
from
vllm.inputs
import
TokensPrompt
...
...
@@ -35,7 +36,8 @@ class MultiModalHasher:
return
np
.
array
(
obj
).
tobytes
()
if
isinstance
(
obj
,
Image
.
Image
):
return
cls
.
item_to_bytes
(
"image"
,
np
.
array
(
obj
.
convert
(
"RGBA"
)))
return
cls
.
item_to_bytes
(
"image"
,
np
.
array
(
convert_image_mode
(
obj
,
"RGBA"
)))
if
isinstance
(
obj
,
torch
.
Tensor
):
return
cls
.
item_to_bytes
(
"tensor"
,
obj
.
numpy
())
if
isinstance
(
obj
,
np
.
ndarray
):
...
...
vllm/multimodal/image.py
View file @
04eb88dc
...
...
@@ -22,6 +22,25 @@ def rescale_image_size(image: Image.Image,
return
image
# TODO: Support customizable background color to fill in.
def
rgba_to_rgb
(
image
:
Image
.
Image
,
background_color
=
(
255
,
255
,
255
))
->
Image
.
Image
:
"""Convert an RGBA image to RGB with filled background color."""
assert
image
.
mode
==
"RGBA"
converted
=
Image
.
new
(
"RGB"
,
image
.
size
,
background_color
)
converted
.
paste
(
image
,
mask
=
image
.
split
()[
3
])
# 3 is the alpha channel
return
converted
def
convert_image_mode
(
image
:
Image
.
Image
,
to_mode
:
str
):
if
image
.
mode
==
to_mode
:
return
image
elif
image
.
mode
==
"RGBA"
and
to_mode
==
"RGB"
:
return
rgba_to_rgb
(
image
)
else
:
return
image
.
convert
(
to_mode
)
class
ImageMediaIO
(
MediaIO
[
Image
.
Image
]):
def
__init__
(
self
,
*
,
image_mode
:
str
=
"RGB"
)
->
None
:
...
...
@@ -32,7 +51,7 @@ class ImageMediaIO(MediaIO[Image.Image]):
def
load_bytes
(
self
,
data
:
bytes
)
->
Image
.
Image
:
image
=
Image
.
open
(
BytesIO
(
data
))
image
.
load
()
return
image
.
convert
(
self
.
image_mode
)
return
convert
_image_mode
(
image
,
self
.
image_mode
)
def
load_base64
(
self
,
media_type
:
str
,
data
:
str
)
->
Image
.
Image
:
return
self
.
load_bytes
(
base64
.
b64decode
(
data
))
...
...
@@ -40,7 +59,7 @@ class ImageMediaIO(MediaIO[Image.Image]):
def
load_file
(
self
,
filepath
:
Path
)
->
Image
.
Image
:
image
=
Image
.
open
(
filepath
)
image
.
load
()
return
image
.
convert
(
self
.
image_mode
)
return
convert
_image_mode
(
image
,
self
.
image_mode
)
def
encode_base64
(
self
,
...
...
@@ -51,7 +70,7 @@ class ImageMediaIO(MediaIO[Image.Image]):
image
=
media
with
BytesIO
()
as
buffer
:
image
=
image
.
convert
(
self
.
image_mode
)
image
=
convert
_image_mode
(
image
,
self
.
image_mode
)
image
.
save
(
buffer
,
image_format
)
data
=
buffer
.
getvalue
()
...
...
vllm/transformers_utils/processors/ovis.py
View file @
04eb88dc
...
...
@@ -33,6 +33,8 @@ from transformers.processing_utils import (ProcessingKwargs, ProcessorMixin,
Unpack
)
from
transformers.tokenization_utils_base
import
PreTokenizedInput
,
TextInput
from
vllm.multimodal.image
import
convert_image_mode
__all__
=
[
'OvisProcessor'
]
IGNORE_ID
=
-
100
...
...
@@ -361,8 +363,8 @@ class OvisProcessor(ProcessorMixin):
# pick the partition with maximum covering_ratio and break the tie using #sub_images
return
sorted
(
all_grids
,
key
=
lambda
x
:
(
-
x
[
1
],
x
[
0
][
0
]
*
x
[
0
][
1
]))[
0
][
0
]
if
convert_to_rgb
and
image
.
mode
!=
'RGB'
:
image
=
image
.
convert
(
'RGB'
)
if
convert_to_rgb
:
image
=
convert
_image_mode
(
image
,
'RGB'
)
sides
=
self
.
get_image_size
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment