Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a01f2fae
Unverified
Commit
a01f2fae
authored
Jan 02, 2026
by
labAxiaoming
Committed by
GitHub
Jan 02, 2026
Browse files
Add multimodal input method in the documentation (#31601)
Signed-off-by:
xiaoming
<
1259730330@qq.com
>
parent
cc410e86
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
96 additions
and
0 deletions
+96
-0
docs/features/multimodal_inputs.md
docs/features/multimodal_inputs.md
+33
-0
examples/online_serving/openai_chat_completion_client_for_multimodal.py
...e_serving/openai_chat_completion_client_for_multimodal.py
+63
-0
No files found.
docs/features/multimodal_inputs.md
View file @
a01f2fae
...
@@ -506,6 +506,7 @@ Then, you can use the OpenAI client as follows:
...
@@ -506,6 +506,7 @@ Then, you can use the OpenAI client as follows:
??? code
??? code
```python
```python
import os
from openai import OpenAI
from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_key = "EMPTY"
...
@@ -517,8 +518,11 @@ Then, you can use the OpenAI client as follows:
...
@@ -517,8 +518,11 @@ Then, you can use the OpenAI client as follows:
)
)
# Single-image input inference
# Single-image input inference
# Public image URL for testing remote image processing
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
# Create chat completion with remote image
chat_response = client.chat.completions.create(
chat_response = client.chat.completions.create(
model="microsoft/Phi-3.5-vision-instruct",
model="microsoft/Phi-3.5-vision-instruct",
messages=[
messages=[
...
@@ -542,6 +546,35 @@ Then, you can use the OpenAI client as follows:
...
@@ -542,6 +546,35 @@ Then, you can use the OpenAI client as follows:
)
)
print("Chat completion output:", chat_response.choices[0].message.content)
print("Chat completion output:", chat_response.choices[0].message.content)
# Path of a local image to send (point this at a real image file before running).
image_file = "/path/to/image.jpg"

# Send the local image by file:// URL.
# Launch the API server/engine with the --allowed-local-media-path argument.
if not os.path.exists(image_file):
    print(f"Local image file not found at {image_file}, skipping local file test.")
else:
    # One user turn: a text question followed by the image reference.
    local_image_content = [
        {
            "type": "text",
            "text": "What’s in this image?",
        },
        {
            "type": "image_url",
            "image_url": {"url": f"file://{image_file}"},
        },
    ]
    chat_completion_from_local_image_url = client.chat.completions.create(
        model="microsoft/Phi-3.5-vision-instruct",
        messages=[
            {
                "role": "user",
                "content": local_image_content,
            }
        ],
    )
    result = chat_completion_from_local_image_url.choices[0].message.content
    print("Chat completion output from local image file:\n", result)
# Multi-image input inference
# Multi-image input inference
image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"
image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"
...
...
examples/online_serving/openai_chat_completion_client_for_multimodal.py
View file @
a01f2fae
...
@@ -21,6 +21,7 @@ python openai_chat_completion_client_for_multimodal.py --chat-type audio
...
@@ -21,6 +21,7 @@ python openai_chat_completion_client_for_multimodal.py --chat-type audio
"""
"""
import
base64
import
base64
import
os
import
requests
import
requests
from
openai
import
OpenAI
from
openai
import
OpenAI
...
@@ -51,6 +52,16 @@ def encode_base64_content_from_url(content_url: str) -> str:
...
@@ -51,6 +52,16 @@ def encode_base64_content_from_url(content_url: str) -> str:
return
result
return
result
def encode_base64_content_from_file(file_path: str) -> str:
    """Read a local file and return its content as base64 text.

    Args:
        file_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to a UTF-8 ``str``.
    """
    with open(file_path, "rb") as source:
        raw_bytes = source.read()
    # Encoding only needs the bytes, so it runs after the handle is closed.
    return base64.b64encode(raw_bytes).decode("utf-8")
# Text-only inference
# Text-only inference
def
run_text_only
(
model
:
str
,
max_completion_tokens
:
int
)
->
None
:
def
run_text_only
(
model
:
str
,
max_completion_tokens
:
int
)
->
None
:
chat_completion
=
client
.
chat
.
completions
.
create
(
chat_completion
=
client
.
chat
.
completions
.
create
(
...
@@ -67,6 +78,7 @@ def run_text_only(model: str, max_completion_tokens: int) -> None:
...
@@ -67,6 +78,7 @@ def run_text_only(model: str, max_completion_tokens: int) -> None:
def
run_single_image
(
model
:
str
,
max_completion_tokens
:
int
)
->
None
:
def
run_single_image
(
model
:
str
,
max_completion_tokens
:
int
)
->
None
:
## Use image url in the payload
## Use image url in the payload
image_url
=
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
image_url
=
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
image_file
=
"/path/to/image.jpg"
# local file
chat_completion_from_url
=
client
.
chat
.
completions
.
create
(
chat_completion_from_url
=
client
.
chat
.
completions
.
create
(
messages
=
[
messages
=
[
{
{
...
@@ -87,6 +99,30 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
...
@@ -87,6 +99,30 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
result
=
chat_completion_from_url
.
choices
[
0
].
message
.
content
result
=
chat_completion_from_url
.
choices
[
0
].
message
.
content
print
(
"Chat completion output from image url:
\n
"
,
result
)
print
(
"Chat completion output from image url:
\n
"
,
result
)
## Use local image url in the payload
# Launch the API server/engine with the --allowed-local-media-path argument.
if
os
.
path
.
exists
(
image_file
):
chat_completion_from_local_image_url
=
client
.
chat
.
completions
.
create
(
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
f
"file://
{
image_file
}
"
},
},
],
}
],
model
=
model
,
max_completion_tokens
=
max_completion_tokens
,
)
result
=
chat_completion_from_local_image_url
.
choices
[
0
].
message
.
content
print
(
"Chat completion output from local image file:
\n
"
,
result
)
else
:
print
(
f
"Local image file not found at
{
image_file
}
, skipping local file test."
)
## Use base64 encoded image in the payload
## Use base64 encoded image in the payload
image_base64
=
encode_base64_content_from_url
(
image_url
)
image_base64
=
encode_base64_content_from_url
(
image_url
)
chat_completion_from_base64
=
client
.
chat
.
completions
.
create
(
chat_completion_from_base64
=
client
.
chat
.
completions
.
create
(
...
@@ -109,6 +145,33 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
...
@@ -109,6 +145,33 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
result
=
chat_completion_from_base64
.
choices
[
0
].
message
.
content
result
=
chat_completion_from_base64
.
choices
[
0
].
message
.
content
print
(
"Chat completion output from base64 encoded image:"
,
result
)
print
(
"Chat completion output from base64 encoded image:"
,
result
)
## Use base64 encoded local image in the payload
if
os
.
path
.
exists
(
image_file
):
local_image_base64
=
encode_base64_content_from_file
(
image_file
)
chat_completion_from_local_image_base64
=
client
.
chat
.
completions
.
create
(
messages
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
f
"data:image/jpeg;base64,
{
local_image_base64
}
"
},
},
],
}
],
model
=
model
,
max_completion_tokens
=
max_completion_tokens
,
)
result
=
chat_completion_from_local_image_base64
.
choices
[
0
].
message
.
content
print
(
"Chat completion output from base64 encoded local image:"
,
result
)
else
:
print
(
f
"Local image file not found at
{
image_file
}
, skipping local file test."
)
# Multi-image input inference
# Multi-image input inference
def
run_multi_image
(
model
:
str
,
max_completion_tokens
:
int
)
->
None
:
def
run_multi_image
(
model
:
str
,
max_completion_tokens
:
int
)
->
None
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment