Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
34cda778
Unverified
Commit
34cda778
authored
Jul 16, 2025
by
Chauncey
Committed by
GitHub
Jul 15, 2025
Browse files
[Frontend] OpenAI Responses API supports input image (#20975)
Signed-off-by:
chaunceyjiang
<
chaunceyjiang@gmail.com
>
parent
30800b01
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
172 additions
and
3 deletions
+172
-3
tests/v1/entrypoints/openai/responses/test_image.py
tests/v1/entrypoints/openai/responses/test_image.py
+166
-0
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+6
-3
No files found.
tests/v1/entrypoints/openai/responses/test_image.py
0 → 100644
View file @
34cda778
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
import
openai
import
pytest
import
pytest_asyncio
from
tests.utils
import
RemoteOpenAIServer
from
vllm.multimodal.utils
import
encode_image_base64
,
fetch_image
# Small vision-language model so the test server stays cheap to launch.
MODEL_NAME: str = "Qwen/Qwen2.5-VL-3B-Instruct"

# Per-prompt image cap handed to the server via --limit-mm-per-prompt.
MAXIMUM_IMAGES: int = 2

# Sample images spanning different extensions (JPG/PNG) and pixel formats
# (gray/RGB/RGBA).
TEST_IMAGE_URLS: list[str] = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
@pytest.fixture(scope="module")
def default_image_server_args():
    """Return the command-line arguments passed to the test server.

    The per-prompt multimodal limit is serialized as JSON and caps the
    number of images at ``MAXIMUM_IMAGES``.
    """
    mm_limit = json.dumps({"image": MAXIMUM_IMAGES})
    return [
        "--enforce-eager",
        "--max-model-len",
        "6000",
        "--max-num-seqs",
        "128",
        "--limit-mm-per-prompt",
        mm_limit,
    ]
@pytest.fixture(scope="module")
def image_server(default_image_server_args):
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME``, shared per module.

    The server object is used as a context manager, so it is exited once
    the module's tests have finished with it.
    """
    server_ctx = RemoteOpenAIServer(MODEL_NAME, default_image_server_args)
    with server_ctx as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(image_server):
    """Yield an async client obtained from ``image_server``.

    The client is entered as an async context manager, so it is exited
    after each test that requested it.
    """
    client_ctx = image_server.get_async_client()
    async with client_ctx as async_client:
        yield async_client
@pytest.fixture(scope="session")
def base64_encoded_image() -> dict[str, str]:
    """Map each URL in ``TEST_IMAGE_URLS`` to its base64-encoded image.

    Each image is fetched with ``fetch_image`` and encoded with
    ``encode_image_base64``; the session scope means this is done once
    per test session.
    """
    encoded_by_url: dict[str, str] = {}
    for url in TEST_IMAGE_URLS:
        encoded_by_url[url] = encode_image_base64(fetch_image(url))
    return encoded_by_url
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                         model_name: str, image_url: str):
    """Send one image by URL plus a text question via the Responses API.

    Passes when the response carries non-empty output text.
    """
    image_part = {
        "type": "input_image",
        "image_url": image_url,
        "detail": "auto",
    }
    text_part = {
        "type": "input_text",
        "text": "What's in this image?",
    }
    user_turn = {
        "role": "user",
        "content": [image_part, text_part],
    }

    # test image url
    response = await client.responses.create(
        model=model_name,
        input=[user_turn],
    )
    assert len(response.output_text) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_url: str,
    base64_encoded_image: dict[str, str],
):
    """Send one image as a base64 data URL plus a text question.

    The payload for ``image_url`` is looked up in the
    ``base64_encoded_image`` fixture and embedded in a ``data:`` URL.
    Passes when the response carries non-empty output text.
    """
    encoded_payload = base64_encoded_image[image_url]
    image_part = {
        "type": "input_image",
        "image_url": f"data:image/jpeg;base64,{encoded_payload}",
        "detail": "auto",
    }
    text_part = {
        "type": "input_text",
        "text": "What's in this image?",
    }
    user_turn = {
        "role": "user",
        "content": [image_part, text_part],
    }

    # test image base64
    response = await client.responses.create(
        model=model_name,
        input=[user_turn],
    )
    assert len(response.output_text) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
                                 image_urls: list[str]):
    """Send several images in one request and check the image limit.

    With at most ``MAXIMUM_IMAGES`` images the request must return
    non-empty output text. With more, the request must raise
    ``openai.BadRequestError``, and a follow-up text-only request must
    still return non-empty output text.
    """
    image_parts = [{
        "type": "input_image",
        "image_url": url,
        "detail": "auto",
    } for url in image_urls]
    question_part = {
        "type": "input_text",
        "text": "What's in this image?",
    }
    messages = [{
        "role": "user",
        "content": [*image_parts, question_part],
    }]

    if not len(image_urls) > MAXIMUM_IMAGES:
        # Within the limit: the multi-image request should simply succeed.
        response = await client.responses.create(
            model=model_name,
            input=messages,
        )
        assert len(response.output_text) > 0
        return

    with pytest.raises(openai.BadRequestError):  # test multi-image input
        await client.responses.create(
            model=model_name,
            input=messages,
        )

    # the server should still work afterwards
    followup = [{
        "role": "user",
        "content": "What's the weather like in Paris today?",
    }]
    response = await client.responses.create(
        model=model_name,
        input=followup,
    )
    assert len(response.output_text) > 0
vllm/entrypoints/chat_utils.py
View file @
34cda778
...
...
@@ -28,6 +28,7 @@ from openai.types.chat import (ChatCompletionMessageToolCallParam,
ChatCompletionToolMessageParam
)
from
openai.types.chat.chat_completion_content_part_input_audio_param
import
(
InputAudio
)
from
openai.types.responses
import
ResponseInputImageParam
from
PIL
import
Image
from
pydantic
import
BaseModel
,
ConfigDict
,
TypeAdapter
# yapf: enable
...
...
@@ -942,6 +943,8 @@ _ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
_AudioParser
=
TypeAdapter
(
ChatCompletionContentPartAudioParam
).
validate_python
_VideoParser
=
TypeAdapter
(
ChatCompletionContentPartVideoParam
).
validate_python
_ResponsesInputImageParser
=
TypeAdapter
(
ResponseInputImageParam
).
validate_python
_ContentPart
:
TypeAlias
=
Union
[
str
,
dict
[
str
,
str
],
InputAudio
,
PILImage
]
# Define a mapping from part types to their corresponding parsing functions.
...
...
@@ -953,6 +956,8 @@ MM_PARSER_MAP: dict[
lambda
part
:
_TextParser
(
part
).
get
(
"text"
,
None
),
"input_text"
:
lambda
part
:
_TextParser
(
part
).
get
(
"text"
,
None
),
"input_image"
:
lambda
part
:
_ResponsesInputImageParser
(
part
).
get
(
"image_url"
,
None
),
"image_url"
:
lambda
part
:
_ImageParser
(
part
).
get
(
"image_url"
,
{}).
get
(
"url"
,
None
),
"image_embeds"
:
...
...
@@ -1085,10 +1090,8 @@ def _parse_chat_message_content_part(
"""
if
isinstance
(
part
,
str
):
# Handle plain text parts
return
part
# Handle structured dictionary parts
part_type
,
content
=
_parse_chat_message_content_mm_part
(
part
)
# if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
# content is None, log a warning and skip
if
part_type
in
VALID_MESSAGE_CONTENT_MM_PART_TYPES
and
content
is
None
:
...
...
@@ -1109,7 +1112,7 @@ def _parse_chat_message_content_part(
image_content
=
cast
(
Image
.
Image
,
content
)
mm_parser
.
parse_image_pil
(
image_content
)
modality
=
"image"
elif
part_type
==
"image_url"
:
elif
part_type
in
(
"image_url"
,
"input_image"
)
:
str_content
=
cast
(
str
,
content
)
mm_parser
.
parse_image
(
str_content
)
modality
=
"image"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment