Add multimodal input method in the documentation (#31601)

Signed-off-by: xiaoming <1259730330@qq.com>

Add multimodal input method in the documentation (#31601)
Signed-off-by: xiaoming <1259730330@qq.com>
a01f2fae · labAxiaoming · GitHub · cc410e86 · a01f2fae · a01f2fae
Unverified Commit a01f2fae authored Jan 02, 2026 by labAxiaoming Committed by GitHub Jan 02, 2026
Showing with 96 additions and 0 deletions

docs/features/multimodal_inputs.md +33 -0

examples/online_serving/openai_chat_completion_client_for_multimodal.py +63 -0

No files found.
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -506,6 +506,7 @@ Then, you can use the OpenAI client as follows:
 ??? code
    ```python
+    import os
    from openai import OpenAI
    openai_api_key = "EMPTY"
@@ -517,8 +518,11 @@ Then, you can use the OpenAI client as follows:
    )
    # Single-image input inference
+    # Public image URL for testing remote image processing
    image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    # Create chat completion with remote image
    chat_response = client.chat.completions.create(
        model="microsoft/Phi-3.5-vision-instruct",
        messages=[
@@ -542,6 +546,35 @@ Then, you can use the OpenAI client as follows:
    )
    print("Chat completion output:", chat_response.choices[0].message.content)
+    # Local image file path (update this to point to your actual image file)
+    image_file = "/path/to/image.jpg"
+    # Create chat completion with local image file
+    # The server itself reads this file:// path, so launch it with --allowed-local-media-path set to a directory that contains the file.
+    if os.path.exists(image_file):
+        chat_completion_from_local_image_url = client.chat.completions.create(
+            model="microsoft/Phi-3.5-vision-instruct",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "What’s in this image?",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"file://{image_file}"},
+                        },
+                    ],
+                }
+            ],
+        )
+        result = chat_completion_from_local_image_url.choices[0].message.content
+        print("Chat completion output from local image file:\n", result)
+    else:
+        print(f"Local image file not found at {image_file}, skipping local file test.")
    # Multi-image input inference
    image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg"
    image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg"

--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -21,6 +21,7 @@ python openai_chat_completion_client_for_multimodal.py --chat-type audio
 """
 import base64
+import os
 import requests
 from openai import OpenAI
@@ -51,6 +52,16 @@ def encode_base64_content_from_url(content_url: str) -> str:
    return result
+def encode_base64_content_from_file(file_path: str) -> str:
+    """Read a local file and return its contents as a base64-encoded string."""
+    with open(file_path, "rb") as file:
+        file_content = file.read()
+        result = base64.b64encode(file_content).decode("utf-8")
+    return result
 # Text-only inference
 def run_text_only(model: str, max_completion_tokens: int) -> None:
    chat_completion = client.chat.completions.create(
@@ -67,6 +78,7 @@ def run_text_only(model: str, max_completion_tokens: int) -> None:
 def run_single_image(model: str, max_completion_tokens: int) -> None:
    ## Use image url in the payload
    image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    image_file = "/path/to/image.jpg"  # local file
    chat_completion_from_url = client.chat.completions.create(
        messages=[
            {
@@ -87,6 +99,30 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from image url:\n", result)
+    ## Use local image url in the payload
+    # The server itself reads this file:// path, so launch it with --allowed-local-media-path set to a directory that contains the file.
+    if os.path.exists(image_file):
+        chat_completion_from_local_image_url = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What's in this image?"},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"file://{image_file}"},
+                        },
+                    ],
+                }
+            ],
+            model=model,
+            max_completion_tokens=max_completion_tokens,
+        )
+        result = chat_completion_from_local_image_url.choices[0].message.content
+        print("Chat completion output from local image file:\n", result)
+    else:
+        print(f"Local image file not found at {image_file}, skipping local file test.")
    ## Use base64 encoded image in the payload
    image_base64 = encode_base64_content_from_url(image_url)
    chat_completion_from_base64 = client.chat.completions.create(
@@ -109,6 +145,33 @@ def run_single_image(model: str, max_completion_tokens: int) -> None:
    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded image:", result)
+    ## Use base64 encoded local image in the payload
+    if os.path.exists(image_file):
+        local_image_base64 = encode_base64_content_from_file(image_file)
+        chat_completion_from_local_image_base64 = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What's in this image?"},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{local_image_base64}"
+                            },
+                        },
+                    ],
+                }
+            ],
+            model=model,
+            max_completion_tokens=max_completion_tokens,
+        )
+        result = chat_completion_from_local_image_base64.choices[0].message.content
+        print("Chat completion output from base64 encoded local image:", result)
+    else:
+        print(f"Local image file not found at {image_file}, skipping local file test.")
 # Multi-image input inference
 def run_multi_image(model: str, max_completion_tokens: int) -> None: