# Qwen3-Omni
## 🛠️ Installation
Please refer to [README.md](../../../README.md)
## Run examples (Qwen3-Omni)
### Launch the Server
```bash
vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
```
If you want to enable async chunking for Qwen3-Omni, launch the server with the command below:
```bash
vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml
```
If you have a custom stage configs file, launch the server with the command below:
```bash
vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file
```
### Send Multi-modal Request
Go to the examples folder:
```bash
cd examples/online_serving/qwen3_omni
```
#### Send request via python
```bash
python openai_chat_completion_client_for_multimodal_generation.py --query-type use_image
```
The Python client supports the following command-line arguments:
- `--query-type` (or `-q`): Query type (default: `use_video`). Options: `text`, `use_audio`, `use_image`, `use_video`
- `--model` (or `-m`): Model name/path (default: `Qwen/Qwen3-Omni-30B-A3B-Instruct`)
- `--video-path` (or `-v`): Path to local video file or URL. If not provided and query-type is `use_video`, uses default video URL. Supports local file paths (automatically encoded to base64) or HTTP/HTTPS URLs. Example: `--video-path /path/to/video.mp4` or `--video-path https://example.com/video.mp4`
- `--image-path` (or `-i`): Path to local image file or URL. If not provided and query-type is `use_image`, uses default image URL. Supports local file paths (automatically encoded to base64) or HTTP/HTTPS URLs and common image formats: JPEG, PNG, GIF, WebP. Example: `--image-path /path/to/image.jpg` or `--image-path https://example.com/image.png`
- `--audio-path` (or `-a`): Path to local audio file or URL. If not provided and query-type is `use_audio`, uses default audio URL. Supports local file paths (automatically encoded to base64) or HTTP/HTTPS URLs and common audio formats: MP3, WAV, OGG, FLAC, M4A. Example: `--audio-path /path/to/audio.wav` or `--audio-path https://example.com/audio.mp3`
- `--prompt` (or `-p`): Custom text prompt/question. If not provided, uses default prompt for the selected query type. Example: `--prompt "What are the main activities shown in this video?"`
For example, to use a local video file with a custom prompt:
```bash
python openai_chat_completion_client_for_multimodal_generation.py \
--query-type use_video \
--video-path /path/to/your/video.mp4 \
--prompt "What are the main activities shown in this video?"
```
#### Send request via curl
```bash
bash run_curl_multimodal_generation.sh use_image
```
### FAQ
If you encounter an error about the librosa audio backend, try installing ffmpeg with the commands below.
```bash
sudo apt update
sudo apt install ffmpeg
```
## Modality control
You can control output modalities to specify which types of output the model should generate. This is useful when you only need text output and want to skip audio generation stages for better performance.
### Supported modalities
| Modalities | Output |
|------------|--------|
| `["text"]` | Text only |
| `["audio"]` | Text + Audio |
| `["text", "audio"]` | Text + Audio |
| Not specified | Text + Audio (default) |
### Using curl
#### Text only
```bash
curl http://localhost:8091/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"messages": [{"role": "user", "content": "Describe vLLM in brief."}],
"modalities": ["text"]
}'
```
#### Text + Audio
```bash
curl http://localhost:8091/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"messages": [{"role": "user", "content": "Describe vLLM in brief."}],
"modalities": ["audio"]
}'
```
### Using Python client
```bash
python openai_chat_completion_client_for_multimodal_generation.py \
--query-type use_image \
--modalities text
```
### Using OpenAI Python SDK
#### Text only
```python
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY")
response = client.chat.completions.create(
model="Qwen/Qwen3-Omni-30B-A3B-Instruct",
messages=[{"role": "user", "content": "Describe vLLM in brief."}],
modalities=["text"]
)
print(response.choices[0].message.content)
```
#### Text + Audio
```python
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY")
response = client.chat.completions.create(
model="Qwen/Qwen3-Omni-30B-A3B-Instruct",
messages=[{"role": "user", "content": "Describe vLLM in brief."}],
modalities=["audio"]
)
# Response contains two choices: one with text, one with audio
print(response.choices[0].message.content) # Text response
print(response.choices[1].message.audio) # Audio response
```
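To save the returned audio to a file, you can decode it the same way the example client does (a minimal sketch; the output filename is illustrative):
```python
import base64

# `choice.message.audio.data` is base64-encoded audio, as in the example client above.
audio_bytes = base64.b64decode(response.choices[1].message.audio.data)
with open("output.wav", "wb") as f:
    f.write(audio_bytes)
```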
## Streaming Output
To enable streaming output, pass the argument shown below. Each output is returned as soon as it is generated by the corresponding stage. Currently only text streaming output is supported; other modalities are still returned normally.
```bash
python openai_chat_completion_client_for_multimodal_generation.py \
--query-type use_image \
--stream
```
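For reference, below is a minimal sketch of consuming the stream with the OpenAI SDK (assuming the server from the "Launch the Server" section is running on port 8091). Only text chunks are printed, matching the current text-only streaming support:
```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="Qwen/Qwen3-Omni-30B-A3B-Instruct",
    messages=[{"role": "user", "content": "Describe vLLM in brief."}],
    modalities=["text"],
    stream=True,
)
for chunk in stream:
    for choice in chunk.choices:
        delta = getattr(choice, "delta", None)
        if delta is not None and delta.content:
            # Text deltas arrive incrementally; print them as they are generated.
            print(delta.content, end="", flush=True)
print()
```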
## Run Local Web UI Demo
This Web UI demo allows users to interact with the model through a web browser.
### Running Gradio Demo
The Gradio demo connects to a vLLM API server. You have two options:
#### Option 1: One-step Launch Script (Recommended)
The convenience script launches both the vLLM server and Gradio demo together:
```bash
./run_gradio_demo.sh --model Qwen/Qwen3-Omni-30B-A3B-Instruct --server-port 8091 --gradio-port 7861
```
This script will:
1. Start the vLLM server in the background
2. Wait for the server to be ready
3. Launch the Gradio demo
4. Handle cleanup when you press Ctrl+C
The script supports the following arguments:
- `--model`: Model name/path (default: Qwen/Qwen3-Omni-30B-A3B-Instruct)
- `--server-port`: Port for vLLM server (default: 8091)
- `--gradio-port`: Port for Gradio demo (default: 7861)
- `--stage-configs-path`: Path to custom stage configs YAML file (optional)
- `--server-host`: Host for vLLM server (default: 0.0.0.0)
- `--gradio-ip`: IP for Gradio demo (default: 127.0.0.1)
- `--share`: Share Gradio demo publicly (creates a public link)
#### Option 2: Manual Launch (Two-Step Process)
**Step 1: Launch the vLLM API server**
```bash
vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
```
If you have a custom stage configs file:
```bash
vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file
```
**Step 2: Run the Gradio demo**
In a separate terminal:
```bash
python gradio_demo.py --model Qwen/Qwen3-Omni-30B-A3B-Instruct --api-base http://localhost:8091/v1 --port 7861
```
Then open `http://localhost:7861/` in your local browser to interact with the web UI.
The Gradio script supports the following arguments:
- `--model`: Model name/path (should match the server model)
- `--api-base`: Base URL for the vLLM API server (default: http://localhost:8091/v1)
- `--ip`: Host/IP for Gradio server (default: 127.0.0.1)
- `--port`: Port for Gradio server (default: 7861)
- `--share`: Share the Gradio demo publicly (creates a public link)
import argparse
import base64
import io
import os
import random
from pathlib import Path
from typing import Any
import gradio as gr
import numpy as np
import soundfile as sf
import torch
from openai import OpenAI
from PIL import Image
SEED = 42
SUPPORTED_MODELS: dict[str, dict[str, Any]] = {
"Qwen/Qwen3-Omni-30B-A3B-Instruct": {
"sampling_params": {
"thinker": {
"temperature": 0.4,
"top_p": 0.9,
"top_k": 1,
"max_tokens": 16384,
"detokenize": True,
"repetition_penalty": 1.05,
"stop_token_ids": [151645],
"seed": SEED,
},
"talker": {
"temperature": 0.9,
"top_k": 50,
"max_tokens": 4096,
"seed": SEED,
"detokenize": False,
"repetition_penalty": 1.05,
"stop_token_ids": [2150],
},
"code2wav": {
"temperature": 0.0,
"top_p": 1.0,
"top_k": -1,
"max_tokens": 4096 * 16,
"seed": SEED,
"detokenize": True,
"repetition_penalty": 1.1,
},
},
},
}
# Ensure deterministic behavior across runs.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
def parse_args():
parser = argparse.ArgumentParser(description="Gradio demo for Qwen3-Omni online inference.")
parser.add_argument(
"--model",
default="Qwen/Qwen3-Omni-30B-A3B-Instruct",
help="Model name/path (should match the server model).",
)
parser.add_argument(
"--api-base",
default="http://localhost:8091/v1",
help="Base URL for the vLLM API server.",
)
parser.add_argument(
"--ip",
default="127.0.0.1",
help="Host/IP for gradio `launch`.",
)
parser.add_argument("--port", type=int, default=7861, help="Port for gradio `launch`.")
parser.add_argument("--share", action="store_true", help="Share the Gradio demo publicly.")
return parser.parse_args()
def build_sampling_params_dict(seed: int, model_key: str) -> list[dict]:
"""Build sampling params as dict for HTTP API mode."""
model_conf = SUPPORTED_MODELS.get(model_key)
if model_conf is None:
raise ValueError(f"Unsupported model '{model_key}'")
sampling_templates: dict[str, dict[str, Any]] = model_conf["sampling_params"]
sampling_params: list[dict] = []
for stage_name, template in sampling_templates.items():
params = dict(template)
params["seed"] = seed
sampling_params.append(params)
return sampling_params
def image_to_base64_data_url(image: Image.Image) -> str:
"""Convert PIL Image to base64 data URL."""
buffered = io.BytesIO()
# Convert to RGB if needed
if image.mode != "RGB":
image = image.convert("RGB")
image.save(buffered, format="JPEG")
img_bytes = buffered.getvalue()
img_b64 = base64.b64encode(img_bytes).decode("utf-8")
return f"data:image/jpeg;base64,{img_b64}"
def audio_to_base64_data_url(audio_data: tuple[np.ndarray, int]) -> str:
"""Convert audio (numpy array, sample_rate) to base64 data URL."""
audio_np, sample_rate = audio_data
# Convert to int16 format for WAV
if audio_np.dtype != np.int16:
# Normalize to [-1, 1] range if needed
if audio_np.dtype == np.float32 or audio_np.dtype == np.float64:
audio_np = np.clip(audio_np, -1.0, 1.0)
audio_np = (audio_np * 32767).astype(np.int16)
else:
audio_np = audio_np.astype(np.int16)
# Write to WAV bytes
buffered = io.BytesIO()
sf.write(buffered, audio_np, sample_rate, format="WAV")
wav_bytes = buffered.getvalue()
wav_b64 = base64.b64encode(wav_bytes).decode("utf-8")
return f"data:audio/wav;base64,{wav_b64}"
def video_to_base64_data_url(video_file: str) -> str:
"""Convert video file to base64 data URL."""
video_path = Path(video_file)
if not video_path.exists():
raise FileNotFoundError(f"Video file not found: {video_file}")
# Detect MIME type from extension
video_path_lower = str(video_path).lower()
if video_path_lower.endswith(".mp4"):
mime_type = "video/mp4"
elif video_path_lower.endswith(".webm"):
mime_type = "video/webm"
elif video_path_lower.endswith(".mov"):
mime_type = "video/quicktime"
elif video_path_lower.endswith(".avi"):
mime_type = "video/x-msvideo"
elif video_path_lower.endswith(".mkv"):
mime_type = "video/x-matroska"
else:
mime_type = "video/mp4"
with open(video_path, "rb") as f:
video_bytes = f.read()
video_b64 = base64.b64encode(video_bytes).decode("utf-8")
return f"data:{mime_type};base64,{video_b64}"
def process_audio_file(
audio_file: Any | None,
) -> tuple[np.ndarray, int] | None:
"""Normalize Gradio audio input to (np.ndarray, sample_rate)."""
if audio_file is None:
return None
sample_rate: int | None = None
audio_np: np.ndarray | None = None
def _load_from_path(path_str: str) -> tuple[np.ndarray, int] | None:
if not path_str:
return None
path = Path(path_str)
if not path.exists():
return None
data, sr = sf.read(path)
if data.ndim > 1:
data = data[:, 0]
return data.astype(np.float32), int(sr)
if isinstance(audio_file, tuple):
if len(audio_file) == 2:
first, second = audio_file
# Case 1: (sample_rate, np.ndarray)
if isinstance(first, (int, float)) and isinstance(second, np.ndarray):
sample_rate = int(first)
audio_np = second
# Case 2: (filepath, (sample_rate, np.ndarray or list))
elif isinstance(first, str):
if isinstance(second, tuple) and len(second) == 2:
sr_candidate, data_candidate = second
if isinstance(sr_candidate, (int, float)) and isinstance(data_candidate, np.ndarray):
sample_rate = int(sr_candidate)
audio_np = data_candidate
if audio_np is None:
loaded = _load_from_path(first)
if loaded is not None:
audio_np, sample_rate = loaded
# Case 3: (None, (sample_rate, np.ndarray))
elif first is None and isinstance(second, tuple) and len(second) == 2:
sr_candidate, data_candidate = second
if isinstance(sr_candidate, (int, float)) and isinstance(data_candidate, np.ndarray):
sample_rate = int(sr_candidate)
audio_np = data_candidate
elif len(audio_file) == 1 and isinstance(audio_file[0], str):
loaded = _load_from_path(audio_file[0])
if loaded is not None:
audio_np, sample_rate = loaded
elif isinstance(audio_file, str):
loaded = _load_from_path(audio_file)
if loaded is not None:
audio_np, sample_rate = loaded
if audio_np is None or sample_rate is None:
return None
if audio_np.ndim > 1:
audio_np = audio_np[:, 0]
return audio_np.astype(np.float32), sample_rate
def process_image_file(image_file: Image.Image | None) -> Image.Image | None:
"""Process image file from Gradio input.
Returns:
PIL Image in RGB mode or None if no image provided.
"""
if image_file is None:
return None
# Convert to RGB if needed
if image_file.mode != "RGB":
image_file = image_file.convert("RGB")
return image_file
def run_inference_api(
client: OpenAI,
model: str,
sampling_params_dict: list[dict],
user_prompt: str,
audio_file: tuple[str, tuple[int, np.ndarray]] | None = None,
image_file: Image.Image | None = None,
video_file: str | None = None,
use_audio_in_video: bool = False,
output_modalities: str | None = None,
stream: bool = False,
):
"""Run inference using OpenAI API client with multimodal support."""
if not user_prompt.strip() and not audio_file and not image_file and not video_file:
yield "Please provide at least a text prompt or multimodal input.", None
try:
# Build message content list
content_list = []
# Process audio
audio_data = process_audio_file(audio_file)
if audio_data is not None:
audio_url = audio_to_base64_data_url(audio_data)
content_list.append(
{
"type": "audio_url",
"audio_url": {"url": audio_url},
}
)
# Process image
if image_file is not None:
image_data = process_image_file(image_file)
if image_data is not None:
image_url = image_to_base64_data_url(image_data)
content_list.append(
{
"type": "image_url",
"image_url": {"url": image_url},
}
)
# Process video
mm_processor_kwargs = {}
if video_file is not None:
video_url = video_to_base64_data_url(video_file)
video_content = {
"type": "video_url",
"video_url": {"url": video_url},
}
if use_audio_in_video:
video_content["video_url"]["num_frames"] = 32 # Default max frames
mm_processor_kwargs["use_audio_in_video"] = True
content_list.append(video_content)
# Add text prompt
if user_prompt.strip():
content_list.append(
{
"type": "text",
"text": user_prompt,
}
)
# Build messages
messages = [
{
"role": "system",
"content": [
{
"type": "text",
"text": (
"You are Qwen, a virtual human developed by the Qwen Team, "
"Alibaba Group, capable of perceiving auditory and visual inputs, "
"as well as generating text and speech."
),
}
],
},
{
"role": "user",
"content": content_list,
},
]
# Build extra_body
extra_body = {
"sampling_params_list": sampling_params_dict,
}
if mm_processor_kwargs:
extra_body["mm_processor_kwargs"] = mm_processor_kwargs
# Parse output modalities
if output_modalities and output_modalities.strip():
output_modalities_list = [m.strip() for m in output_modalities.split(",")]
else:
output_modalities_list = None
# Call API
chat_completion = client.chat.completions.create(
messages=messages,
model=model,
modalities=output_modalities_list,
extra_body=extra_body,
stream=stream,
)
if not stream:
# Non-streaming mode: extract outputs and yield once
text_outputs: list[str] = []
audio_output = None
for choice in chat_completion.choices:
if choice.message.content:
text_outputs.append(choice.message.content)
if choice.message.audio:
# Decode base64 audio
audio_data = base64.b64decode(choice.message.audio.data)
# Load audio from bytes
audio_np, sample_rate = sf.read(io.BytesIO(audio_data))
# Convert to mono if needed
if audio_np.ndim > 1:
audio_np = audio_np[:, 0]
audio_output = (int(sample_rate), audio_np.astype(np.float32))
text_response = "\n\n".join(text_outputs) if text_outputs else "No text output."
yield text_response, audio_output
else:
# Streaming mode: yield incremental updates
text_content = ""
audio_output = None
for chunk in chat_completion:
for choice in chunk.choices:
if hasattr(choice, "delta"):
content = getattr(choice.delta, "content", None)
else:
content = None
# Handle audio modality
if getattr(chunk, "modality", None) == "audio" and content:
try:
# Decode base64 audio
audio_data = base64.b64decode(content)
# Load audio from bytes
audio_np, sample_rate = sf.read(io.BytesIO(audio_data))
# Convert to mono if needed
if audio_np.ndim > 1:
audio_np = audio_np[:, 0]
audio_output = (int(sample_rate), audio_np.astype(np.float32))
# Yield current text and audio
yield text_content if text_content else "", audio_output
except Exception: # pylint: disable=broad-except
# If audio processing fails, just yield text
yield text_content if text_content else "", None
# Handle text modality
elif getattr(chunk, "modality", None) == "text":
if content:
text_content += content
# Yield updated text content (keep existing audio if any)
yield text_content, audio_output
# Final yield with accumulated text and last audio (if any)
yield text_content if text_content else "No text output.", audio_output
except Exception as exc: # pylint: disable=broad-except
error_msg = f"Inference failed: {exc}"
yield error_msg, None
def build_interface(
client: OpenAI,
model: str,
sampling_params_dict: list[dict],
):
"""Build Gradio interface for API server mode."""
def run_inference(
user_prompt: str,
audio_file: tuple[str, tuple[int, np.ndarray]] | None,
image_file: Image.Image | None,
video_file: str | None,
use_audio_in_video: bool,
output_modalities: str | None = None,
stream: bool = False,
):
# Always yield from the API function to maintain consistent generator behavior
yield from run_inference_api(
client,
model,
sampling_params_dict,
user_prompt,
audio_file,
image_file,
video_file,
use_audio_in_video,
output_modalities,
stream,
)
css = """
.media-input-container {
display: flex;
gap: 10px;
}
.media-input-container > div {
flex: 1;
}
.media-input-container .image-input,
.media-input-container .audio-input {
height: 300px;
}
.media-input-container .video-column {
height: 300px;
display: flex;
flex-direction: column;
}
.media-input-container .video-input {
flex: 1;
min-height: 0;
}
#generate-btn button {
width: 100%;
}
"""
with gr.Blocks(css=css) as demo:
gr.Markdown("# vLLM-Omni Online Serving Demo")
gr.Markdown(f"**Model:** {model} \n\n")
with gr.Column():
with gr.Row():
input_box = gr.Textbox(
label="Text Prompt",
placeholder="For example: Describe what happens in the media inputs.",
lines=4,
scale=1,
)
with gr.Row(elem_classes="media-input-container"):
image_input = gr.Image(
label="Image Input (optional)",
type="pil",
sources=["upload"],
scale=1,
elem_classes="image-input",
)
with gr.Column(scale=1, elem_classes="video-column"):
video_input = gr.Video(
label="Video Input (optional)",
sources=["upload"],
elem_classes="video-input",
)
use_audio_in_video_checkbox = gr.Checkbox(
label="Use audio from video",
value=False,
info="Extract the video's audio track when provided.",
)
audio_input = gr.Audio(
label="Audio Input (optional)",
type="numpy",
sources=["upload", "microphone"],
scale=1,
elem_classes="audio-input",
)
with gr.Row():
output_modalities = gr.Textbox(
label="Output Modalities",
value=None,
placeholder="For example: text, image, video. Use comma to separate multiple modalities.",
lines=1,
scale=2,
)
stream_checkbox = gr.Checkbox(
label="Stream output",
value=False,
info="Enable streaming to see output as it's generated.",
scale=1,
)
with gr.Row():
generate_btn = gr.Button(
"Generate",
variant="primary",
size="lg",
elem_id="generate-btn",
)
with gr.Row():
text_output = gr.Textbox(label="Text Output", lines=10, scale=2)
audio_output = gr.Audio(label="Audio Output", interactive=False, scale=1)
generate_btn.click(
fn=run_inference,
inputs=[
input_box,
audio_input,
image_input,
video_input,
use_audio_in_video_checkbox,
output_modalities,
stream_checkbox,
],
outputs=[text_output, audio_output],
)
demo.queue()
return demo
def main():
args = parse_args()
model_name = "/".join(args.model.split("/")[-2:])
assert model_name in SUPPORTED_MODELS, (
f"Unsupported model '{model_name}'. Supported models: {SUPPORTED_MODELS.keys()}"
)
# Initialize OpenAI client
print(f"Connecting to API server at: {args.api_base}")
client = OpenAI(
api_key="EMPTY",
base_url=args.api_base,
)
print("✓ Connected to API server")
# Build sampling params
sampling_params_dict = build_sampling_params_dict(SEED, model_name)
demo = build_interface(
client,
args.model,
sampling_params_dict,
)
try:
demo.launch(
server_name=args.ip,
server_port=args.port,
share=args.share,
)
except KeyboardInterrupt:
print("\nShutting down...")
if __name__ == "__main__":
main()
import base64
import concurrent.futures
import os
from typing import NamedTuple
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8091/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
SEED = 42
class QueryResult(NamedTuple):
inputs: dict
limit_mm_per_prompt: dict[str, int]
def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format."""
with requests.get(content_url) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode("utf-8")
return result
def encode_base64_content_from_file(file_path: str) -> str:
"""Encode a local file to base64 format."""
with open(file_path, "rb") as f:
content = f.read()
result = base64.b64encode(content).decode("utf-8")
return result
def get_video_url_from_path(video_path: str | None) -> str:
"""Convert a video path (local file or URL) to a video URL format for the API.
If video_path is None or empty, returns the default URL.
If video_path is a local file path, encodes it to base64 data URL.
If video_path is a URL, returns it as-is.
"""
if not video_path:
# Default video URL
return "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4"
# Check if it's a URL (starts with http:// or https://)
if video_path.startswith(("http://", "https://")):
return video_path
# Otherwise, treat it as a local file path
if not os.path.exists(video_path):
raise FileNotFoundError(f"Video file not found: {video_path}")
# Detect video MIME type from file extension
video_path_lower = video_path.lower()
if video_path_lower.endswith(".mp4"):
mime_type = "video/mp4"
elif video_path_lower.endswith(".webm"):
mime_type = "video/webm"
elif video_path_lower.endswith(".mov"):
mime_type = "video/quicktime"
elif video_path_lower.endswith(".avi"):
mime_type = "video/x-msvideo"
elif video_path_lower.endswith(".mkv"):
mime_type = "video/x-matroska"
else:
# Default to mp4 if extension is unknown
mime_type = "video/mp4"
video_base64 = encode_base64_content_from_file(video_path)
return f"data:{mime_type};base64,{video_base64}"
def get_image_url_from_path(image_path: str | None) -> str:
"""Convert an image path (local file or URL) to an image URL format for the API.
If image_path is None or empty, returns the default URL.
If image_path is a local file path, encodes it to base64 data URL.
If image_path is a URL, returns it as-is.
"""
if not image_path:
# Default image URL
return "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg"
# Check if it's a URL (starts with http:// or https://)
if image_path.startswith(("http://", "https://")):
return image_path
# Otherwise, treat it as a local file path
if not os.path.exists(image_path):
raise FileNotFoundError(f"Image file not found: {image_path}")
# Detect image MIME type from file extension
image_path_lower = image_path.lower()
if image_path_lower.endswith((".jpg", ".jpeg")):
mime_type = "image/jpeg"
elif image_path_lower.endswith(".png"):
mime_type = "image/png"
elif image_path_lower.endswith(".gif"):
mime_type = "image/gif"
elif image_path_lower.endswith(".webp"):
mime_type = "image/webp"
else:
# Default to jpeg if extension is unknown
mime_type = "image/jpeg"
image_base64 = encode_base64_content_from_file(image_path)
return f"data:{mime_type};base64,{image_base64}"
def get_audio_url_from_path(audio_path: str | None) -> str:
"""Convert an audio path (local file or URL) to an audio URL format for the API.
If audio_path is None or empty, returns the default URL.
If audio_path is a local file path, encodes it to base64 data URL.
If audio_path is a URL, returns it as-is.
"""
if not audio_path:
# Default audio URL
return AudioAsset("mary_had_lamb").url
# Check if it's a URL (starts with http:// or https://)
if audio_path.startswith(("http://", "https://")):
return audio_path
# Otherwise, treat it as a local file path
if not os.path.exists(audio_path):
raise FileNotFoundError(f"Audio file not found: {audio_path}")
# Detect audio MIME type from file extension
audio_path_lower = audio_path.lower()
if audio_path_lower.endswith((".mp3", ".mpeg")):
mime_type = "audio/mpeg"
elif audio_path_lower.endswith(".wav"):
mime_type = "audio/wav"
elif audio_path_lower.endswith(".ogg"):
mime_type = "audio/ogg"
elif audio_path_lower.endswith(".flac"):
mime_type = "audio/flac"
elif audio_path_lower.endswith(".m4a"):
mime_type = "audio/mp4"
else:
# Default to wav if extension is unknown
mime_type = "audio/wav"
audio_base64 = encode_base64_content_from_file(audio_path)
return f"data:{mime_type};base64,{audio_base64}"
def get_system_prompt():
return {
"role": "system",
"content": [
{
"type": "text",
"text": (
"You are Qwen, a virtual human developed by the Qwen Team, "
"Alibaba Group, capable of perceiving auditory and visual inputs, "
"as well as generating text and speech."
),
}
],
}
def get_text_query(custom_prompt: str | None = None):
question = (
custom_prompt or "Explain the system architecture for a scalable audio generation pipeline. Answer in 15 words."
)
prompt = {
"role": "user",
"content": [
{
"type": "text",
"text": f"{question}",
}
],
}
return prompt
default_system = (
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
"Group, capable of perceiving auditory and visual inputs, as well as "
"generating text and speech."
)
def get_video_query(video_path: str | None = None, custom_prompt: str | None = None):
question = custom_prompt or "Why is this video funny?"
video_url = get_video_url_from_path(video_path)
prompt = {
"role": "user",
"content": [
{
"type": "video_url",
"video_url": {"url": video_url},
},
{
"type": "text",
"text": f"{question}",
},
],
}
return prompt
def get_image_query(image_path: str | None = None, custom_prompt: str | None = None):
question = custom_prompt or "What is the content of this image?"
image_url = get_image_url_from_path(image_path)
prompt = {
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": image_url},
},
{
"type": "text",
"text": f"{question}",
},
],
}
return prompt
def get_audio_query(audio_path: str | None = None, custom_prompt: str | None = None):
question = custom_prompt or "What is the content of this audio?"
audio_url = get_audio_url_from_path(audio_path)
prompt = {
"role": "user",
"content": [
{
"type": "audio_url",
"audio_url": {"url": audio_url},
},
{
"type": "text",
"text": f"{question}",
},
],
}
return prompt
def get_mixed_modalities_query(
video_path: str | None = None,
image_path: str | None = None,
audio_path: str | None = None,
custom_prompt: str | None = None,
):
"""
Online-friendly multimodal user message:
- Uses URLs (or base64 data URLs) for audio / image / video.
- Returns the OpenAI-style message dict directly (not the offline QueryResult).
"""
question = (
custom_prompt or "What is recited in the audio? What is the content of this image? Why is this video funny?"
)
audio_url = get_audio_url_from_path(audio_path)
image_url = get_image_url_from_path(image_path)
video_url = get_video_url_from_path(video_path)
return {
"role": "user",
"content": [
{"type": "audio_url", "audio_url": {"url": audio_url}},
{"type": "image_url", "image_url": {"url": image_url}},
{"type": "video_url", "video_url": {"url": video_url}},
{"type": "text", "text": question},
],
}
def get_multi_audios_query(custom_prompt: str | None = None):
"""
Online-friendly two-audio comparison request.
- Encodes both audio clips as URLs (or data URLs).
- Returns the OpenAI-style message dict.
"""
question = custom_prompt or "Are these two audio clips the same?"
# Use default demo clips; you can point to your own via --audio-path if needed.
audio_url_1 = get_audio_url_from_path(AudioAsset("winning_call").url)
audio_url_2 = get_audio_url_from_path(AudioAsset("mary_had_lamb").url)
return {
"role": "user",
"content": [
{"type": "audio_url", "audio_url": {"url": audio_url_1}},
{"type": "audio_url", "audio_url": {"url": audio_url_2}},
{"type": "text", "text": question},
],
}
def get_use_audio_in_video_query(
video_path: str | None = None,
audio_path: str | None = None,
custom_prompt: str | None = None,
):
question = custom_prompt or (
"Describe the content of the video in details, then convert what the baby say into text."
)
video_url = get_video_url_from_path(video_path)
audio_url = get_audio_url_from_path(audio_path)
return {
"role": "user",
"content": [
{"type": "video_url", "video_url": {"url": video_url}},
{"type": "audio_url", "audio_url": {"url": audio_url}},
{"type": "text", "text": question},
],
}
query_map = {
"text": get_text_query,
"use_audio": get_audio_query,
"use_image": get_image_query,
"use_video": get_video_query,
"use_mixed_modalities": get_mixed_modalities_query,
"use_multi_audios": get_multi_audios_query,
"use_audio_in_video": get_use_audio_in_video_query,
}
def run_multimodal_generation(args) -> None:
model_name = args.model
thinker_sampling_params = {
"temperature": 0.4, # Deterministic
"top_p": 0.9,
"top_k": 1,
"max_tokens": 16384,
"repetition_penalty": 1.05,
"stop_token_ids": [151645], # Qwen EOS token <|im_end|>
"seed": SEED,
}
# Sampling parameters for Talker stage (codec generation)
# Stop at codec EOS token
talker_sampling_params = {
"temperature": 0.9,
"top_k": 50,
"max_tokens": 4096,
"seed": SEED,
"detokenize": False,
"repetition_penalty": 1.05,
"stop_token_ids": [2150], # TALKER_CODEC_EOS_TOKEN_ID
}
# # Sampling parameters for Code2Wav stage (audio generation)
code2wav_sampling_params = {
"temperature": 0.0,
"top_p": 1.0,
"top_k": -1,
"max_tokens": 4096 * 16,
"seed": SEED,
"detokenize": True,
"repetition_penalty": 1.1,
}
sampling_params_list = [
thinker_sampling_params,
talker_sampling_params,
code2wav_sampling_params,
]
# Get paths and custom prompt from args
video_path = getattr(args, "video_path", None)
image_path = getattr(args, "image_path", None)
audio_path = getattr(args, "audio_path", None)
custom_prompt = getattr(args, "prompt", None)
# Get the query function and call it with appropriate parameters
query_func = query_map[args.query_type]
if args.query_type == "use_video":
prompt = query_func(video_path=video_path, custom_prompt=custom_prompt)
elif args.query_type == "use_image":
prompt = query_func(image_path=image_path, custom_prompt=custom_prompt)
elif args.query_type == "use_audio":
prompt = query_func(audio_path=audio_path, custom_prompt=custom_prompt)
elif args.query_type == "text":
prompt = query_func(custom_prompt=custom_prompt)
elif args.query_type == "use_audio_in_video":
prompt = query_func(
video_path=video_path,
audio_path=audio_path,
custom_prompt=custom_prompt,
)
else:
prompt = query_func()
extra_body = {
"sampling_params_list": sampling_params_list # Optional, it has a default setting in stage_configs of the corresponding model.
}
if args.query_type == "use_audio_in_video":
extra_body["mm_processor_kwargs"] = {"use_audio_in_video": True}
if args.modalities is not None:
output_modalities = args.modalities.split(",")
else:
output_modalities = None
# Test multiple concurrent completions
num_concurrent_requests = args.num_concurrent_requests
with concurrent.futures.ThreadPoolExecutor(max_workers=num_concurrent_requests) as executor:
# Submit multiple completion requests concurrently
futures = [
executor.submit(
client.chat.completions.create,
messages=[
get_system_prompt(),
prompt,
],
model=model_name,
modalities=output_modalities,
extra_body=extra_body,
stream=args.stream,
)
for _ in range(num_concurrent_requests)
]
# Wait for all requests to complete and collect results
chat_completions = [future.result() for future in concurrent.futures.as_completed(futures)]
assert len(chat_completions) == num_concurrent_requests
count = 0
if not args.stream:
# Verify all completions succeeded
for chat_completion in chat_completions:
for choice in chat_completion.choices:
if choice.message.audio:
audio_data = base64.b64decode(choice.message.audio.data)
audio_file_path = f"audio_{count}.wav"
with open(audio_file_path, "wb") as f:
f.write(audio_data)
print(f"Audio saved to {audio_file_path}")
count += 1
elif choice.message.content:
print("Chat completion output from text:", choice.message.content)
else:
printed_content = False
for chat_completion in chat_completions:
for chunk in chat_completion:
for choice in chunk.choices:
if hasattr(choice, "delta"):
content = getattr(choice.delta, "content", None)
else:
content = None
if getattr(chunk, "modality", None) == "audio" and content:
audio_data = base64.b64decode(content)
audio_file_path = f"audio_{count}.wav"
with open(audio_file_path, "wb") as f:
f.write(audio_data)
print(f"\nAudio saved to {audio_file_path}")
count += 1
elif getattr(chunk, "modality", None) == "text":
if not printed_content:
printed_content = True
print("\ncontent:", end="", flush=True)
print(content, end="", flush=True)
def parse_args():
parser = FlexibleArgumentParser(description="Demo on using the vLLM OpenAI-compatible server for online inference with multimodal language models")
parser.add_argument(
"--query-type",
"-q",
type=str,
default="use_mixed_modalities",
choices=query_map.keys(),
help="Query type.",
)
parser.add_argument(
"--model",
"-m",
type=str,
default="Qwen/Qwen3-Omni-30B-A3B-Instruct",
help="Model Name / Path",
)
parser.add_argument(
"--video-path",
"-v",
type=str,
default=None,
help="Path to local video file or URL. If not provided and query-type is 'use_video', uses default video URL.",
)
parser.add_argument(
"--image-path",
"-i",
type=str,
default=None,
help="Path to local image file or URL. If not provided and query-type is 'use_image', uses default image URL.",
)
parser.add_argument(
"--audio-path",
"-a",
type=str,
default=None,
help="Path to local audio file or URL. If not provided and query-type is 'use_audio', uses default audio URL.",
)
parser.add_argument(
"--prompt",
"-p",
type=str,
default=None,
help="Custom text prompt/question to use instead of the default prompt for the selected query type.",
)
parser.add_argument(
"--modalities",
type=str,
default=None,
help="Output modalities to use for the prompts.",
)
parser.add_argument(
"--stream",
action="store_true",
help="Stream the response.",
)
parser.add_argument(
"--num-concurrent-requests",
type=int,
default=1,
help="Number of concurrent requests to send. Default is 1.",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
run_multimodal_generation(args)
# Stage config for running Qwen3-Omni-MoE-Thinking (text-only output)
# This config is for models like Qwen3-Omni-30B-A3B-Thinking that only have the
# thinker component and do not support audio output.
#
# Single stage: Thinker (multimodal understanding + text generation)
# The following config has been verified on 2x H100-80G GPUs.
stage_args:
- stage_id: 0
runtime:
devices: "0,1"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.9
enforce_eager: true
trust_remote_code: true
engine_output_type: text
distributed_executor_backend: "mp"
enable_prefix_caching: false
hf_config_name: thinker_config
tensor_parallel_size: 2
final_output: true
final_output_type: text
is_comprehension: true
default_sampling_params:
temperature: 0.4
top_p: 0.9
top_k: 1
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.05
#!/usr/bin/env bash
set -euo pipefail
# Default query type
QUERY_TYPE="${1:-use_video}"
# Default modalities argument
MODALITIES="${2:-null}"
# Validate query type
if [[ ! "$QUERY_TYPE" =~ ^(text|use_audio|use_image|use_video)$ ]]; then
echo "Error: Invalid query type '$QUERY_TYPE'"
echo "Usage: $0 [text|use_audio|use_image|use_video] [modalities]"
echo " text: Text query"
echo " use_audio: Audio + Text query"
echo " use_image: Image + Text query"
echo " use_video: Video + Text query"
echo " modalities: Modalities parameter (default: null)"
exit 1
fi
SEED=42
thinker_sampling_params='{
"temperature": 0.4,
"top_p": 0.9,
"top_k": 1,
"max_tokens": 16384,
"seed": 42,
"repetition_penalty": 1.05,
"stop_token_ids": [151645]
}'
talker_sampling_params='{
"temperature": 0.9,
"top_k": 50,
"max_tokens": 4096,
"seed": 42,
"detokenize": false,
"repetition_penalty": 1.05,
"stop_token_ids": [2150]
}'
code2wav_sampling_params='{
"temperature": 0.0,
"top_p": 1.0,
"top_k": -1,
"max_tokens": 65536,
"seed": 42,
"detokenize": true,
"repetition_penalty": 1.1
}'
# Above is optional, it has a default setting in stage_configs of the corresponding model.
# Define URLs for assets
MARY_HAD_LAMB_AUDIO_URL="https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/mary_had_lamb.ogg"
CHERRY_BLOSSOM_IMAGE_URL="https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg"
SAMPLE_VIDEO_URL="https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4"
# Build user content and extra fields based on query type
case "$QUERY_TYPE" in
text)
user_content='[
{
"type": "text",
"text": "Explain the system architecture for a scalable audio generation pipeline. Answer in 15 words."
}
]'
sampling_params_list='[
'"$thinker_sampling_params"',
'"$talker_sampling_params"',
'"$code2wav_sampling_params"'
]'
mm_processor_kwargs="{}"
;;
use_audio)
user_content='[
{
"type": "audio_url",
"audio_url": {
"url": "'"$MARY_HAD_LAMB_AUDIO_URL"'"
}
},
{
"type": "text",
"text": "What is the content of this audio?"
}
]'
sampling_params_list='[
'"$thinker_sampling_params"',
'"$talker_sampling_params"',
'"$code2wav_sampling_params"'
]'
mm_processor_kwargs="{}"
;;
use_image)
user_content='[
{
"type": "image_url",
"image_url": {
"url": "'"$CHERRY_BLOSSOM_IMAGE_URL"'"
}
},
{
"type": "text",
"text": "What is the content of this image?"
}
]'
sampling_params_list='[
'"$thinker_sampling_params"',
'"$talker_sampling_params"',
'"$code2wav_sampling_params"'
]'
mm_processor_kwargs="{}"
;;
use_video)
user_content='[
{
"type": "video_url",
"video_url": {
"url": "'"$SAMPLE_VIDEO_URL"'"
}
},
{
"type": "text",
"text": "Why is this video funny?"
}
]'
sampling_params_list='[
'"$thinker_sampling_params"',
'"$talker_sampling_params"',
'"$code2wav_sampling_params"'
]'
mm_processor_kwargs="{}"
;;
esac
echo "Running query type: $QUERY_TYPE"
echo ""
output=$(curl -sS -X POST http://localhost:8091/v1/chat/completions \
-H "Content-Type: application/json" \
-d @- <<EOF
{
"model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"sampling_params_list": $sampling_params_list,
"mm_processor_kwargs": $mm_processor_kwargs,
"modalities": $MODALITIES,
"messages": [
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
}
]
},
{
"role": "user",
"content": $user_content
}
]
}
EOF
)
# Only the text content of the first choice is shown here; the audio content is large binary data, so it is not displayed.
echo "Output of request: $(echo "$output" | jq '.choices[0].message.content')"
#!/bin/bash
# Convenience script to launch both vLLM server and Gradio demo for Qwen3-Omni
#
# Usage:
# ./run_gradio_demo.sh [OPTIONS]
#
# Example:
# ./run_gradio_demo.sh --model Qwen/Qwen3-Omni-30B-A3B-Instruct --server-port 8091 --gradio-port 7861
set -e
# Default values
MODEL="Qwen/Qwen3-Omni-30B-A3B-Instruct"
SERVER_PORT=8091
GRADIO_PORT=7861
STAGE_CONFIGS_PATH=""
SERVER_HOST="0.0.0.0"
GRADIO_IP="127.0.0.1"
GRADIO_SHARE=false
# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL="$2"
shift 2
;;
--server-port)
SERVER_PORT="$2"
shift 2
;;
--gradio-port)
GRADIO_PORT="$2"
shift 2
;;
--stage-configs-path)
STAGE_CONFIGS_PATH="$2"
shift 2
;;
--server-host)
SERVER_HOST="$2"
shift 2
;;
--gradio-ip)
GRADIO_IP="$2"
shift 2
;;
--share)
GRADIO_SHARE=true
shift
;;
--help)
echo "Usage: $0 [OPTIONS]"
echo ""
echo "Options:"
echo " --model MODEL Model name/path (default: Qwen/Qwen3-Omni-30B-A3B-Instruct)"
echo " --server-port PORT Port for vLLM server (default: 8091)"
echo " --gradio-port PORT Port for Gradio demo (default: 7861)"
echo " --stage-configs-path PATH Path to custom stage configs YAML file (optional)"
echo " --server-host HOST Host for vLLM server (default: 0.0.0.0)"
echo " --gradio-ip IP IP for Gradio demo (default: 127.0.0.1)"
echo " --share Share Gradio demo publicly"
echo " --help Show this help message"
echo ""
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use --help for usage information"
exit 1
;;
esac
done
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
API_BASE="http://localhost:${SERVER_PORT}/v1"
HEALTH_URL="http://localhost:${SERVER_PORT}/health"
echo "=========================================="
echo "Starting vLLM-Omni Gradio Demo"
echo "=========================================="
echo "Model: $MODEL"
echo "Server: http://${SERVER_HOST}:${SERVER_PORT}"
echo "Gradio: http://${GRADIO_IP}:${GRADIO_PORT}"
echo "=========================================="
# Build vLLM server command
SERVER_CMD=("vllm" "serve" "$MODEL" "--omni" "--port" "$SERVER_PORT" "--host" "$SERVER_HOST")
if [ -n "$STAGE_CONFIGS_PATH" ]; then
SERVER_CMD+=("--stage-configs-path" "$STAGE_CONFIGS_PATH")
fi
# Function to cleanup on exit
cleanup() {
echo ""
echo "Shutting down..."
if [ -n "$SERVER_PID" ]; then
echo "Stopping vLLM server (PID: $SERVER_PID)..."
kill "$SERVER_PID" 2>/dev/null || true
wait "$SERVER_PID" 2>/dev/null || true
fi
if [ -n "$GRADIO_PID" ]; then
echo "Stopping Gradio demo (PID: $GRADIO_PID)..."
kill "$GRADIO_PID" 2>/dev/null || true
wait "$GRADIO_PID" 2>/dev/null || true
fi
echo "Cleanup complete"
exit 0
}
# Set up signal handlers
trap cleanup SIGINT SIGTERM
# Start vLLM server with output shown in real-time and saved to log
echo ""
echo "Starting vLLM server..."
LOG_FILE="/tmp/vllm_server_${SERVER_PORT}.log"
"${SERVER_CMD[@]}" 2>&1 | tee "$LOG_FILE" &
SERVER_PID=$!
# Start a background process to monitor the log for startup completion
STARTUP_COMPLETE=false
TAIL_PID=""
# Function to cleanup tail process
cleanup_tail() {
if [ -n "$TAIL_PID" ]; then
kill "$TAIL_PID" 2>/dev/null || true
wait "$TAIL_PID" 2>/dev/null || true
fi
}
# Wait for server to be ready by checking log output
echo ""
echo "Waiting for vLLM server to be ready (checking for 'Application startup complete' message)..."
echo ""
# Monitor log file for startup completion message
MAX_WAIT=300 # 5 minutes timeout as fallback
ELAPSED=0
# Use a temporary file to track startup completion
STARTUP_FLAG="/tmp/vllm_startup_flag_${SERVER_PORT}.tmp"
rm -f "$STARTUP_FLAG"
# Start monitoring in background
(
tail -f "$LOG_FILE" 2>/dev/null | grep -m 1 "Application startup complete" > /dev/null && touch "$STARTUP_FLAG"
) &
TAIL_PID=$!
while [ $ELAPSED -lt $MAX_WAIT ]; do
# Check if startup flag file exists (startup complete)
if [ -f "$STARTUP_FLAG" ]; then
cleanup_tail
echo ""
echo "✓ vLLM server is ready!"
STARTUP_COMPLETE=true
break
fi
# Check if server process is still running
if ! kill -0 "$SERVER_PID" 2>/dev/null; then
cleanup_tail
echo ""
echo "Error: vLLM server failed to start (process terminated)"
wait "$SERVER_PID" 2>/dev/null || true
exit 1
fi
sleep 1
ELAPSED=$((ELAPSED + 1))
done
cleanup_tail
rm -f "$STARTUP_FLAG"
if [ "$STARTUP_COMPLETE" != "true" ]; then
echo ""
echo "Error: vLLM server did not complete startup within ${MAX_WAIT} seconds"
kill "$SERVER_PID" 2>/dev/null || true
exit 1
fi
# Start Gradio demo
echo ""
echo "Starting Gradio demo..."
cd "$SCRIPT_DIR"
GRADIO_CMD=("python" "gradio_demo.py" "--model" "$MODEL" "--api-base" "$API_BASE" "--ip" "$GRADIO_IP" "--port" "$GRADIO_PORT")
if [ "$GRADIO_SHARE" = true ]; then
GRADIO_CMD+=("--share")
fi
"${GRADIO_CMD[@]}" > /tmp/gradio_demo.log 2>&1 &
GRADIO_PID=$!
echo ""
echo "=========================================="
echo "Both services are running!"
echo "=========================================="
echo "vLLM Server: http://${SERVER_HOST}:${SERVER_PORT}"
echo "Gradio Demo: http://${GRADIO_IP}:${GRADIO_PORT}"
echo ""
echo "Press Ctrl+C to stop both services"
echo "=========================================="
echo ""
# Wait for either process to exit
wait $SERVER_PID $GRADIO_PID || true
cleanup
# Qwen3-TTS Online Serving
This directory contains examples for running Qwen3-TTS models with vLLM-Omni's online serving API.
## Supported Models
| Model | Task Type | Description |
|-------|-----------|-------------|
| `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | CustomVoice | Predefined speaker voices with optional style control |
| `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` | VoiceDesign | Natural language voice style description |
| `Qwen/Qwen3-TTS-12Hz-1.7B-Base` | Base | Voice cloning from reference audio |
## Quick Start
### 1. Start the Server
```bash
# CustomVoice model (default)
./run_server.sh
# Or specify task type
./run_server.sh CustomVoice
./run_server.sh VoiceDesign
./run_server.sh Base
```
### 2. Run the Client
```bash
# CustomVoice: Use predefined speaker
python openai_speech_client.py \
--text "你好,我是通义千问" \
--voice Vivian \
--language Chinese
# CustomVoice with style instruction
python openai_speech_client.py \
--text "今天天气真好" \
--voice Ryan \
--instructions "用开心的语气说"
# VoiceDesign: Describe the voice style
python openai_speech_client.py \
--model Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \
--task-type VoiceDesign \
--text "哥哥,你回来啦" \
--instructions "体现撒娇稚嫩的萝莉女声,音调偏高"
# Base: Voice cloning
python openai_speech_client.py \
--model Qwen/Qwen3-TTS-12Hz-1.7B-Base \
--task-type Base \
--text "Hello, this is a cloned voice" \
--ref-audio /path/to/reference.wav \
--ref-text "Original transcript of the reference audio"
```
### 3. Using curl
```bash
# Simple TTS request
curl -X POST http://localhost:8000/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"input": "Hello, how are you?",
"voice": "Vivian",
"language": "English"
}' --output output.wav
# With style instruction
curl -X POST http://localhost:8000/v1/audio/speech \
-H "Content-Type: application/json" \
-d '{
"input": "I am so excited!",
"voice": "Vivian",
"instructions": "Speak with great enthusiasm"
}' --output excited.wav
# List available voices in CustomVoice models
curl http://localhost:8000/v1/audio/voices
```
## API Reference
### Endpoint
```
POST /v1/audio/speech
```
This endpoint follows the [OpenAI Audio Speech API](https://platform.openai.com/docs/api-reference/audio/createSpeech) format with additional Qwen3-TTS parameters.
### Request Body
```json
{
"input": "Text to synthesize",
"voice": "Vivian",
"response_format": "wav",
"task_type": "CustomVoice",
"language": "Auto",
"instructions": "Optional style instructions",
"ref_audio": "URL or base64 for voice cloning",
"ref_text": "Reference audio transcript",
"x_vector_only_mode": false,
"max_new_tokens": 2048
}
```
> **Note:** The `model` field is optional when serving a single model, as the server already knows which model is loaded.
### Response
Returns audio data in the requested format (default: WAV).
## Parameters
### Standard OpenAI Parameters
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `input` | string | required | Text to synthesize |
| `voice` | string | "Vivian" | Speaker/voice name |
| `response_format` | string | "wav" | Audio format: wav, mp3, flac, pcm, aac, opus |
| `speed` | float | 1.0 | Playback speed (0.25-4.0) |
| `model` | string | optional | Model name (optional when serving single model) |
### Qwen3-TTS Parameters
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `task_type` | string | "CustomVoice" | Task: CustomVoice, VoiceDesign, or Base |
| `language` | string | "Auto" | Language: Auto, Chinese, English, Japanese, Korean |
| `instructions` | string | "" | Voice style/emotion instructions |
| `max_new_tokens` | int | 2048 | Maximum tokens to generate |
### Voice Clone Parameters (Base task)
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `ref_audio` | string | Yes | Reference audio (file path, URL, or base64) |
| `ref_text` | string | No | Transcript of reference audio (for ICL mode) |
| `x_vector_only_mode` | bool | false | Use speaker embedding only (no ICL) |
## Python Usage
```python
import httpx
# Simple request
response = httpx.post(
"http://localhost:8000/v1/audio/speech",
json={
"model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
"input": "Hello world",
"voice": "Vivian",
},
timeout=300.0,
)
with open("output.wav", "wb") as f:
f.write(response.content)
```
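For the Base (voice cloning) task, a hedged sketch along the same lines, assuming a local reference WAV file (replace the path and transcript with your own):
```python
import base64

import httpx

# Encode the local reference clip as a base64 data URL, as expected by `ref_audio`.
with open("/path/to/reference.wav", "rb") as f:
    ref_audio = "data:audio/wav;base64," + base64.b64encode(f.read()).decode("utf-8")

response = httpx.post(
    "http://localhost:8000/v1/audio/speech",
    json={
        "model": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
        "input": "Hello, this is a cloned voice",
        "task_type": "Base",
        "ref_audio": ref_audio,
        "ref_text": "Original transcript of the reference audio",
    },
    timeout=300.0,
)
with open("cloned_output.wav", "wb") as f:
    f.write(response.content)
```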
## Limitations
- **No streaming**: Audio is generated completely before being returned. Streaming will be supported after the pipeline is disaggregated (see RFC #938).
- **Single request**: Batch processing is not yet optimized for online serving.
## Troubleshooting
1. **Connection refused**: Make sure the server is running on the correct port
2. **Out of memory**: Reduce `--gpu-memory-utilization` in run_server.sh
3. **Unsupported speaker**: Check supported speakers via model documentation
4. **Voice clone fails**: Ensure you're using the Base model variant for voice cloning
"""OpenAI-compatible client for Qwen3-TTS via /v1/audio/speech endpoint.
This script demonstrates how to use the OpenAI-compatible speech API
to generate audio from text using Qwen3-TTS models.
Examples:
# CustomVoice task (predefined speaker)
python openai_speech_client.py --text "Hello, how are you?" --voice Vivian
# CustomVoice with emotion instruction
python openai_speech_client.py --text "I'm so happy!" --voice Vivian \
--instructions "Speak with excitement"
# VoiceDesign task (voice from description)
python openai_speech_client.py --text "Hello world" \
--task-type VoiceDesign \
--instructions "A warm, friendly female voice"
# Base task (voice cloning)
python openai_speech_client.py --text "Hello world" \
--task-type Base \
--ref-audio "https://example.com/reference.wav" \
--ref-text "This is the reference transcript"
"""
import argparse
import base64
import os
import httpx
# Default server configuration
DEFAULT_API_BASE = "http://localhost:8000"
DEFAULT_API_KEY = "EMPTY"
def encode_audio_to_base64(audio_path: str) -> str:
"""Encode a local audio file to base64 data URL."""
if not os.path.exists(audio_path):
raise FileNotFoundError(f"Audio file not found: {audio_path}")
# Detect MIME type from extension
audio_path_lower = audio_path.lower()
if audio_path_lower.endswith(".wav"):
mime_type = "audio/wav"
elif audio_path_lower.endswith((".mp3", ".mpeg")):
mime_type = "audio/mpeg"
elif audio_path_lower.endswith(".flac"):
mime_type = "audio/flac"
elif audio_path_lower.endswith(".ogg"):
mime_type = "audio/ogg"
else:
mime_type = "audio/wav" # Default
with open(audio_path, "rb") as f:
audio_bytes = f.read()
audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")
return f"data:{mime_type};base64,{audio_b64}"
def run_tts_generation(args) -> None:
"""Run TTS generation via OpenAI-compatible /v1/audio/speech API."""
# Build request payload
payload = {
"model": args.model,
"input": args.text,
"voice": args.voice,
"response_format": args.response_format,
}
# Add optional parameters
if args.instructions:
payload["instructions"] = args.instructions
if args.task_type:
payload["task_type"] = args.task_type
if args.language:
payload["language"] = args.language
if args.max_new_tokens:
payload["max_new_tokens"] = args.max_new_tokens
# Voice clone parameters (Base task)
if args.ref_audio:
if args.ref_audio.startswith(("http://", "https://")):
payload["ref_audio"] = args.ref_audio
else:
payload["ref_audio"] = encode_audio_to_base64(args.ref_audio)
if args.ref_text:
payload["ref_text"] = args.ref_text
if args.x_vector_only:
payload["x_vector_only_mode"] = True
print(f"Model: {args.model}")
print(f"Task type: {args.task_type or 'CustomVoice'}")
print(f"Text: {args.text}")
print(f"Voice: {args.voice}")
print("Generating audio...")
# Make the API call
api_url = f"{args.api_base}/v1/audio/speech"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {args.api_key}",
}
with httpx.Client(timeout=300.0) as client:
response = client.post(api_url, json=payload, headers=headers)
if response.status_code != 200:
print(f"Error: {response.status_code}")
print(response.text)
return
if response.content.startswith(b'{"error"'):
print(f"Error: {response.content.decode('utf-8')}")
return
# Save audio response
output_path = args.output or "tts_output.wav"
with open(output_path, "wb") as f:
f.write(response.content)
print(f"Audio saved to: {output_path}")
def parse_args():
"""Parse command line arguments."""
parser = argparse.ArgumentParser(
description="OpenAI-compatible client for Qwen3-TTS via /v1/audio/speech",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
# Server configuration
parser.add_argument(
"--api-base",
type=str,
default=DEFAULT_API_BASE,
help=f"API base URL (default: {DEFAULT_API_BASE})",
)
parser.add_argument(
"--api-key",
type=str,
default=DEFAULT_API_KEY,
help="API key (default: EMPTY)",
)
parser.add_argument(
"--model",
"-m",
type=str,
default="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
help="Model name/path",
)
# Task configuration
parser.add_argument(
"--task-type",
"-t",
type=str,
default=None,
choices=["CustomVoice", "VoiceDesign", "Base"],
help="TTS task type (default: CustomVoice)",
)
# Input text
parser.add_argument(
"--text",
type=str,
required=True,
help="Text to synthesize",
)
# Voice/speaker
parser.add_argument(
"--voice",
type=str,
default="Vivian",
help="Speaker/voice name (default: Vivian). Options: Vivian, Ryan, etc.",
)
parser.add_argument(
"--language",
type=str,
default=None,
help="Language: Auto, Chinese, English, etc.",
)
parser.add_argument(
"--instructions",
type=str,
default=None,
help="Voice style/emotion instructions",
)
# Base (voice clone) parameters
parser.add_argument(
"--ref-audio",
type=str,
default=None,
help="Reference audio file path or URL for voice cloning (Base task)",
)
parser.add_argument(
"--ref-text",
type=str,
default=None,
help="Reference audio transcript for voice cloning (Base task)",
)
parser.add_argument(
"--x-vector-only",
action="store_true",
help="Use x-vector only mode for voice cloning (no ICL)",
)
# Generation parameters
parser.add_argument(
"--max-new-tokens",
type=int,
default=None,
help="Maximum new tokens to generate",
)
# Output
parser.add_argument(
"--response-format",
type=str,
default="wav",
choices=["wav", "mp3", "flac", "pcm", "aac", "opus"],
help="Audio output format (default: wav)",
)
parser.add_argument(
"--output",
"-o",
type=str,
default=None,
help="Output audio file path (default: tts_output.wav)",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
run_tts_generation(args)
#!/bin/bash
# Launch vLLM-Omni server for Qwen3-TTS models
#
# Usage:
# ./run_server.sh # Default: CustomVoice model
# ./run_server.sh CustomVoice # CustomVoice model
# ./run_server.sh VoiceDesign # VoiceDesign model
# ./run_server.sh Base # Base (voice clone) model
set -e
TASK_TYPE="${1:-CustomVoice}"
case "$TASK_TYPE" in
CustomVoice)
MODEL="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
;;
VoiceDesign)
MODEL="Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"
;;
Base)
MODEL="Qwen/Qwen3-TTS-12Hz-1.7B-Base"
;;
*)
echo "Unknown task type: $TASK_TYPE"
echo "Supported: CustomVoice, VoiceDesign, Base"
exit 1
;;
esac
echo "Starting Qwen3-TTS server with model: $MODEL"
vllm-omni serve "$MODEL" \
--stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \
--host 0.0.0.0 \
--port 8000 \
--gpu-memory-utilization 0.9 \
--trust-remote-code \
--enforce-eager \
--omni
# Text-To-Image
This example demonstrates how to deploy the Qwen-Image model as an online image generation service using vLLM-Omni.
## Start Server
### Basic Start
```bash
vllm serve Qwen/Qwen-Image --omni --port 8091
```
!!! note
    If you encounter Out-of-Memory (OOM) issues or have limited GPU memory, you can enable VAE slicing and tiling to reduce memory usage by adding `--vae-use-slicing --vae-use-tiling` to the serve command.
### Start with Parameters
Alternatively, use the startup script:
```bash
bash run_server.sh
```
## API Calls
### Method 1: Using curl
```bash
# Basic text-to-image generation
bash run_curl_text_to_image.sh
# Or execute directly
curl -s http://localhost:8091/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "A beautiful landscape painting"}
],
"extra_body": {
"height": 1024,
"width": 1024,
"num_inference_steps": 50,
"true_cfg_scale": 4.0,
"seed": 42
}
}' | jq -r '.choices[0].message.content[0].image_url.url' | cut -d',' -f2- | base64 -d > output.png
```
### Method 2: Using Python Client
```bash
python openai_chat_client.py --prompt "A beautiful landscape painting" --output output.png
```
### Method 3: Using Gradio Demo
```bash
python gradio_demo.py
# Visit http://localhost:7860
```
## Request Format
### Simple Text Generation
```json
{
"messages": [
{"role": "user", "content": "A beautiful landscape painting"}
]
}
```
### Generation with Parameters
Use `extra_body` to pass generation parameters:
```json
{
"messages": [
{"role": "user", "content": "A beautiful landscape painting"}
],
"extra_body": {
"height": 1024,
"width": 1024,
"num_inference_steps": 50,
"true_cfg_scale": 4.0,
"seed": 42
}
}
```
### Multimodal Input (Text + Structured Content)
```json
{
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": "A beautiful landscape painting"}
]
}
]
}
```
## Generation Parameters (extra_body)
| Parameter | Type | Default | Description |
| ------------------------ | ----- | ------- | ------------------------------ |
| `height` | int | None | Image height in pixels |
| `width` | int | None | Image width in pixels |
| `size` | str | None | Image size (e.g., "1024x1024") |
| `num_inference_steps` | int | 50 | Number of denoising steps |
| `true_cfg_scale` | float | 4.0 | Qwen-Image CFG scale |
| `seed` | int | None | Random seed (reproducible) |
| `negative_prompt` | str | None | Negative prompt |
| `num_outputs_per_prompt` | int | 1 | Number of images to generate |
| `--cfg-parallel-size` | int | 1 | Number of GPUs for CFG parallelism |
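The same request can also be sent from Python. Below is a minimal sketch using `requests` that mirrors the curl example above; it assumes the server started earlier is reachable at `http://localhost:8091`:
```python
import requests

# Generation parameters from the table above go into "extra_body";
# the server is assumed to be the one launched earlier on port 8091.
payload = {
    "messages": [{"role": "user", "content": "A beautiful landscape painting"}],
    "extra_body": {
        "height": 1024,
        "width": 1024,
        "num_inference_steps": 50,
        "true_cfg_scale": 4.0,
        "seed": 42,
    },
}
response = requests.post(
    "http://localhost:8091/v1/chat/completions",
    json=payload,
    timeout=300,
)
response.raise_for_status()
# The image comes back as a data URL; print its prefix as a quick sanity check.
print(response.json()["choices"][0]["message"]["content"][0]["image_url"]["url"][:64])
```
See the Response Format and Extract Image sections below for decoding the returned image.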
## Response Format
```json
{
"id": "chatcmpl-xxx",
"created": 1234567890,
"model": "Qwen/Qwen-Image",
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"content": [{
"type": "image_url",
"image_url": {
"url": "data:image/png;base64,..."
}
}]
},
"finish_reason": "stop"
}],
"usage": {...}
}
```
## Extract Image
```bash
# Extract base64 from response and decode to image
cat response.json | jq -r '.choices[0].message.content[0].image_url.url' | cut -d',' -f2- | base64 -d > output.png
```
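If you prefer Python over `jq`, the same extraction can be done with the standard library. This is a sketch assuming the chat completion response was saved to `response.json` as above:
```python
import base64
import json

# Load the saved chat completion response and pull out the data URL.
with open("response.json") as f:
    data = json.load(f)
image_url = data["choices"][0]["message"]["content"][0]["image_url"]["url"]

# Strip the "data:image/png;base64," prefix and decode the payload.
_, b64_data = image_url.split(",", 1)
with open("output.png", "wb") as f:
    f.write(base64.b64decode(b64_data))
```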
## File Description
| File | Description |
| --------------------------- | ---------------------------- |
| `run_server.sh` | Server startup script |
| `run_curl_text_to_image.sh` | curl example |
| `openai_chat_client.py` | Python client |
| `gradio_demo.py` | Gradio interactive interface |
#!/usr/bin/env python3
"""
Qwen-Image Gradio Demo for online serving.
Usage:
python gradio_demo.py [--server http://localhost:8091] [--port 7860]
"""
import argparse
import base64
from io import BytesIO
import gradio as gr
import requests
from PIL import Image
def generate_image(
prompt: str,
height: int,
width: int,
steps: int,
cfg_scale: float,
seed: int | None,
negative_prompt: str,
server_url: str,
num_outputs_per_prompt: int = 1,
) -> Image.Image | None:
"""Generate an image using the chat completions API."""
messages = [{"role": "user", "content": prompt}]
# Build extra_body with generation parameters
extra_body = {
"height": height,
"width": width,
"num_inference_steps": steps,
"true_cfg_scale": cfg_scale,
}
if seed is not None and seed >= 0:
extra_body["seed"] = seed
if negative_prompt:
extra_body["negative_prompt"] = negative_prompt
# Keep consistent with run_curl_text_to_image.sh, always send num_outputs_per_prompt
extra_body["num_outputs_per_prompt"] = num_outputs_per_prompt
# Build request payload
payload = {"messages": messages, "extra_body": extra_body}
try:
response = requests.post(
f"{server_url}/v1/chat/completions",
headers={"Content-Type": "application/json"},
json=payload,
timeout=300,
)
response.raise_for_status()
data = response.json()
content = data["choices"][0]["message"]["content"]
if isinstance(content, list) and len(content) > 0:
image_url = content[0].get("image_url", {}).get("url", "")
if image_url.startswith("data:image"):
_, b64_data = image_url.split(",", 1)
image_bytes = base64.b64decode(b64_data)
return Image.open(BytesIO(image_bytes))
return None
except Exception as e:
print(f"Error: {e}")
raise gr.Error(f"Generation failed: {e}")
def create_demo(server_url: str):
"""Create Gradio demo interface."""
with gr.Blocks(title="Qwen-Image Demo") as demo:
gr.Markdown("# Qwen-Image Online Generation")
gr.Markdown("Generate images using Qwen-Image model")
with gr.Row():
with gr.Column(scale=1):
prompt = gr.Textbox(
label="Prompt",
placeholder="Describe the image you want to generate...",
lines=3,
)
negative_prompt = gr.Textbox(
label="Negative Prompt",
placeholder="Describe what you don't want...",
lines=2,
)
with gr.Row():
height = gr.Slider(
label="Height",
minimum=256,
maximum=2048,
value=1024,
step=64,
)
width = gr.Slider(
label="Width",
minimum=256,
maximum=2048,
value=1024,
step=64,
)
with gr.Row():
steps = gr.Slider(
label="Inference Steps",
minimum=10,
maximum=100,
                        # Default steps set to 100 (aligned with run_curl_text_to_image.sh)
value=100,
step=5,
)
cfg_scale = gr.Slider(
label="True CFG Scale",
minimum=1.0,
maximum=20.0,
value=4.0,
step=0.5,
)
with gr.Row():
seed = gr.Number(
label="Random Seed (-1 for random)",
value=-1,
precision=0,
)
generate_btn = gr.Button("Generate Image", variant="primary")
with gr.Column(scale=1):
output_image = gr.Image(
label="Generated Image",
type="pil",
)
# Examples
gr.Examples(
examples=[
["A beautiful landscape painting with misty mountains", "", 1024, 1024, 100, 4.0, 42],
["A cute cat sitting on a windowsill with sunlight", "", 1024, 1024, 100, 4.0, 123],
["Cyberpunk style futuristic city with neon lights", "blurry, low quality", 1024, 768, 100, 4.0, 456],
["Chinese ink painting of bamboo forest with a house", "", 768, 1024, 100, 4.0, 789],
],
inputs=[prompt, negative_prompt, height, width, steps, cfg_scale, seed],
)
generate_btn.click(
fn=lambda p, h, w, st, c, se, n: generate_image(
p,
h,
w,
st,
c,
se if se >= 0 else None,
n,
server_url,
1,
),
inputs=[prompt, height, width, steps, cfg_scale, seed, negative_prompt],
outputs=[output_image],
)
return demo
def main():
parser = argparse.ArgumentParser(description="Qwen-Image Gradio Demo")
parser.add_argument("--server", default="http://localhost:8091", help="Server URL")
parser.add_argument("--port", type=int, default=7860, help="Gradio port")
parser.add_argument("--share", action="store_true", help="Create public link")
args = parser.parse_args()
print(f"Connecting to server: {args.server}")
demo = create_demo(args.server)
demo.launch(server_port=args.port, share=args.share)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
"""
Qwen-Image OpenAI-compatible image generation client.
Usage:
python openai_chat_client.py --prompt "A beautiful landscape" --output output.png
python openai_chat_client.py --prompt "A sunset" --height 1024 --width 1024 --steps 50 --seed 42
"""
import argparse
import base64
from pathlib import Path
import requests
def generate_image(
prompt: str,
server_url: str = "http://localhost:8091",
height: int | None = None,
width: int | None = None,
steps: int | None = None,
true_cfg_scale: float | None = None,
seed: int | None = None,
negative_prompt: str | None = None,
num_outputs_per_prompt: int = 1,
) -> bytes | None:
"""Generate an image using the images generation API.
Args:
prompt: Text description of the image
server_url: Server URL
height: Image height in pixels
width: Image width in pixels
steps: Number of diffusion steps
true_cfg_scale: Qwen-Image CFG scale
seed: Random seed
negative_prompt: Negative prompt
num_outputs_per_prompt: Number of images to generate
Returns:
Image bytes or None if failed
"""
payload: dict[str, object] = {
"prompt": prompt,
"response_format": "b64_json",
"n": num_outputs_per_prompt,
}
if width is not None and height is not None:
payload["size"] = f"{width}x{height}"
elif width is not None:
payload["size"] = f"{width}x{width}"
elif height is not None:
payload["size"] = f"{height}x{height}"
if steps is not None:
payload["num_inference_steps"] = steps
if true_cfg_scale is not None:
payload["true_cfg_scale"] = true_cfg_scale
if negative_prompt:
payload["negative_prompt"] = negative_prompt
if seed is not None:
payload["seed"] = seed
try:
response = requests.post(
f"{server_url}/v1/images/generations",
headers={"Content-Type": "application/json"},
json=payload,
timeout=300,
)
response.raise_for_status()
data = response.json()
items = data.get("data")
if isinstance(items, list) and items:
first = items[0].get("b64_json") if isinstance(items[0], dict) else None
if isinstance(first, str):
return base64.b64decode(first)
print(f"Unexpected response format: {data}")
return None
except Exception as e:
print(f"Error: {e}")
return None
def main():
    parser = argparse.ArgumentParser(description="Qwen-Image image generation client")
parser.add_argument("--prompt", "-p", default="a cup of coffee on the table", help="Text prompt")
parser.add_argument("--output", "-o", default="qwen_image_output.png", help="Output file")
parser.add_argument("--server", "-s", default="http://localhost:8091", help="Server URL")
parser.add_argument("--height", type=int, default=1024, help="Image height")
parser.add_argument("--width", type=int, default=1024, help="Image width")
parser.add_argument("--steps", type=int, default=50, help="Inference steps")
parser.add_argument("--cfg-scale", type=float, default=4.0, help="True CFG scale")
parser.add_argument("--seed", type=int, default=0, help="Random seed")
parser.add_argument("--negative", help="Negative prompt")
args = parser.parse_args()
print(f"Generating image for: {args.prompt}")
image_bytes = generate_image(
prompt=args.prompt,
server_url=args.server,
height=args.height,
width=args.width,
steps=args.steps,
true_cfg_scale=args.cfg_scale,
seed=args.seed,
negative_prompt=args.negative,
)
if image_bytes:
output_path = Path(args.output)
output_path.write_bytes(image_bytes)
print(f"Image saved to: {output_path}")
print(f"Size: {len(image_bytes) / 1024:.1f} KB")
else:
print("Failed to generate image")
exit(1)
if __name__ == "__main__":
main()
#!/bin/bash
# Qwen-Image text-to-image curl example
curl -X POST http://localhost:8091/v1/images/generations \
-H "Content-Type: application/json" \
-d '{
"prompt": "a dragon laying over the spine of the Green Mountains of Vermont",
"size": "1024x1024",
"seed": 42
}' | jq -r '.data[0].b64_json' | base64 -d > dragon.png
#!/bin/bash
# Qwen-Image online serving startup script
MODEL="${MODEL:-Qwen/Qwen-Image}"
PORT="${PORT:-8091}"
echo "Starting Qwen-Image server..."
echo "Model: $MODEL"
echo "Port: $PORT"
vllm serve "$MODEL" --omni \
--port "$PORT"
site_name: vLLM-Omni
site_description: Efficient omni-modality model serving for everyone
site_author: vLLM-Omni Team
site_url: https://vllm-project.github.io/vllm-omni/
repo_name: vllm-project/vllm-omni
repo_url: https://github.com/vllm-project/vllm-omni
edit_uri: edit/main/docs/
# Copyright
copyright: Copyright &copy; 2025 vLLM-Omni Team
# Theme
theme:
name: material
logo: source/logos/vllm-logo-only-light.ico
favicon: source/logos/vllm-logo-only-light.ico
palette:
# Palette toggle for automatic mode
- media: "(prefers-color-scheme)"
toggle:
icon: material/brightness-auto
name: Switch to light mode
# Palette toggle for light mode
- media: "(prefers-color-scheme: light)"
scheme: default
primary: white
toggle:
icon: material/brightness-7
name: Switch to dark mode
# Palette toggle for dark mode
- media: "(prefers-color-scheme: dark)"
scheme: slate
primary: black
toggle:
icon: material/brightness-2
name: Switch to system preference
features:
- content.action.edit
- content.code.copy
- navigation.instant
- navigation.instant.progress
- navigation.tracking
- navigation.tabs
- navigation.tabs.sticky
- navigation.sections
- navigation.indexes
- navigation.top
- search.suggest
- search.highlight
- search.share
- content.code.annotate
- content.tabs
- content.tooltips
- toc.follow
custom_dir: docs/mkdocs/overrides
hooks:
- docs/mkdocs/hooks/generate_api_readme.py
- docs/mkdocs/hooks/url_schemes.py
- docs/mkdocs/hooks/generate_examples.py
# Exclude include files from navigation warnings
exclude_docs: |
**/*.inc.md
# Plugins
plugins:
- meta
- search
- autorefs
- awesome-nav
- glightbox
- git-revision-date-localized:
# exclude files
exclude:
- api/*
- user_guide/examples/**
- contributing/design_documents/api_design_template.md
- DOCS_GUIDE.md
- minify:
minify_html: true
minify_js: true
minify_css: true
cache_safe: true
js_files: [docs/mkdocs/javascript/*.js]
css_files: [docs/mkdocs/stylesheets/*.css]
- api-autonav:
modules: ["vllm_omni"]
api_root_uri: "api"
nav_item_prefix: "" # No prefix in navigation tree (clean names)
show_full_namespace: false # Show only module name, not full path
on_implicit_namespace_package: skip # Skip directories without __init__.py (e.g., assets)
exclude:
- "re:vllm_omni\\._.*" # Internal modules
- "vllm_omni.diffusion.models.qwen_image" # avoid importing vllm in mkdocs building
- "vllm_omni.entrypoints.async_diffusion" # avoid importing vllm in mkdocs building
- "vllm_omni.entrypoints.openai" # avoid importing vllm in mkdocs building
- "vllm_omni.entrypoints.openai.protocol" # avoid importing vllm in mkdocs building
- mkdocstrings:
handlers:
python:
options:
show_symbol_type_heading: true
show_symbol_type_toc: true
filters:
- "!^_" # Exclude private members (methods/classes starting with underscore)
summary:
modules: true
show_if_no_docstring: true
show_signature_annotations: true
separate_signature: true
show_overloads: true
signature_crossrefs: true
inventories:
- https://docs.python.org/3/objects.inv
- https://typing-extensions.readthedocs.io/en/latest/objects.inv
- https://docs.aiohttp.org/en/stable/objects.inv
- https://pillow.readthedocs.io/en/stable/objects.inv
- https://numpy.org/doc/stable/objects.inv
# Temporarily disabled due to decompression errors
# - https://pytorch.org/docs/stable/objects.inv
- https://psutil.readthedocs.io/en/stable/objects.inv
markdown_extensions:
- attr_list
- md_in_html
- admonition
- pymdownx.details
# For content tabs
- pymdownx.superfences:
custom_fences:
- name: mermaid
class: mermaid
format: !!python/name:pymdownx.superfences.fence_code_format
- pymdownx.tabbed:
slugify: !!python/object/apply:pymdownx.slugs.slugify
kwds:
case: lower
alternate_style: true
# For code highlighting
- pymdownx.highlight:
anchor_linenums: true
line_spans: __span
pygments_lang_class: true
- pymdownx.inlinehilite
- pymdownx.snippets
# For emoji and icons
- pymdownx.emoji:
emoji_index: !!python/name:material.extensions.emoji.twemoji
emoji_generator: !!python/name:material.extensions.emoji.to_svg
# For in page [TOC] (not sidebar)
- toc:
permalink: true
# For math rendering
- pymdownx.arithmatex:
generic: true
extra_css:
- mkdocs/stylesheets/extra.css
extra_javascript:
- mkdocs/javascript/mathjax.js
- https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js
- https://unpkg.com/mermaid@10/dist/mermaid.min.js
- mkdocs/javascript/mermaid.js
- mkdocs/javascript/edit_and_feedback.js
- mkdocs/javascript/slack_and_forum.js
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "vllm-omni"
version = "0.14.0"
description = "A framework for efficient model inference with omni-modality models"
readme = "README.md"
requires-python = ">=3.10,<3.14"
license = {text = "Apache-2.0"}
authors = [
{name = "vLLM-Omni Team"}
]
keywords = ["vllm", "multimodal", "diffusion", "transformer", "inference", "serving"]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = [
# Core runtime dependencies (required for actual usage)
"omegaconf>=2.3.0",
"librosa>=0.11.0",
"resampy>=0.4.3",
"diffusers>=0.36.0",
"accelerate==1.12.0",
"gradio==5.50",
"soundfile>=0.13.1",
"cache-dit==1.2.0",
"tqdm>=4.66.0",
"torchsde>=0.2.6", # Required for Stable Audio scheduler
"fa3-fwd==0.0.1", # flash attention 3, maintained by @ZJY0516
"openai-whisper>=20250625",
"imageio[ffmpeg]>=2.37.2",
"onnxruntime>=1.19.0",
"sox>=1.5.0",
# "vllm==0.14.0", # TODO: fix the entrypoints overwrite problem
]
[project.optional-dependencies]
dev = [
"pytest>=7.0.0",
"pytest-asyncio>=0.21.0",
"pytest-cov>=4.0.0",
"mypy==1.11.1",
"pre-commit==4.0.1",
"openai-whisper>=20250625",
"psutil>=7.2.0",
"soundfile>=0.13.1",
"imageio[ffmpeg]>=0.6.0",
"opencv-python>=4.12.0.88",
"mooncake-transfer-engine==0.3.8.post1"
]
docs = [
"mkdocs>=1.5.0",
"mkdocs-api-autonav",
"mkdocs-material",
"mkdocstrings-python",
"mkdocs-gen-files",
"mkdocs-awesome-nav",
"mkdocs-glightbox",
"mkdocs-git-revision-date-localized-plugin",
"mkdocs-minify-plugin",
"regex",
"ruff",
"pydantic",
]
[project.urls]
Homepage = "https://github.com/vllm-project/vllm-omni"
Repository = "https://github.com/vllm-project/vllm-omni"
Documentation = "https://vllm-omni.readthedocs.io"
"Bug Tracker" = "https://github.com/vllm-project/vllm-omni/issues"
[project.scripts]
vllm = "vllm_omni.entrypoints.cli.main:main"
vllm-omni = "vllm_omni.entrypoints.cli.main:main"
[tool.setuptools.packages.find]
where = ["."]
include = ["vllm_omni*"]
[tool.setuptools.package-data]
"vllm_omni.model_executor.stage_configs" = ["*.yaml"]
[tool.ruff]
line-length = 120
exclude = [
".eggs",
".git",
".hg",
".mypy_cache",
".tox",
".venv",
"build",
"dist",
"vllm_omni.egg-info",
]
[tool.ruff.lint]
select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"I", # isort (handled separately, but included for compatibility)
"N", # pep8-naming
"UP", # pyupgrade
]
ignore = [
"E203", # whitespace before ':' (conflicts with black)
# W503 is not needed in ruff as it's compatible with black by default
"N801", # class names should use CapWords convention
"N802", # function name should follow snake_case
"N806", # variable in function should follow snake_case
"N812", # lowercase imported as non-lowercase: functional as F
]
[tool.ruff.lint.per-file-ignores]
"examples/**" = ["E501"] # Allow long lines in examples
"tests/**" = ["E501"] # Allow long lines in tests
[tool.mypy]
python_version = "3.12"
warn_return_any = true
warn_unused_configs = true
disallow_untyped_defs = true
disallow_incomplete_defs = true
check_untyped_defs = true
disallow_untyped_decorators = true
no_implicit_optional = true
warn_redundant_casts = true
warn_unused_ignores = true
warn_no_return = true
warn_unreachable = true
strict_equality = true
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py", "*_test.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
"--strict-markers",
"--strict-config",
"--cov=vllm_omni",
"--cov-report=term-missing",
"--cov-report=html",
"--cov-report=xml",
]
markers = [
# ci/cd required
"core_model: Core model tests (run in each PR)",
# function module markers
"diffusion: Diffusion model tests",
"omni: Omni model tests",
"cache: Cache backend tests",
"parallel: Parallelism/distributed tests",
# platform markers
"cpu: Tests that run on CPU",
"gpu: Tests that run on GPU (auto-added)",
"cuda: Tests that run on CUDA (auto-added)",
"rocm: Tests that run on AMD/ROCm (auto-added)",
"npu: Tests that run on NPU/Ascend (auto-added)",
    # markers for specific compute resources (auto-added)
"H100: Tests that require H100 GPU",
"L4: Tests that require L4 GPU",
"MI325: Tests that require MI325 GPU (AMD/ROCm)",
"A2: Tests that require A2 NPU",
"A3: Tests that require A3 NPU",
"distributed_cuda: Tests that require multi cards on CUDA platform",
"distributed_rocm: Tests that require multi cards on ROCm platform",
"distributed_npu: Tests that require multi cards on NPU platform",
"skipif_cuda: Skip if the num of CUDA cards is less than the required",
"skipif_rocm: Skip if the num of ROCm cards is less than the required",
"skipif_npu: Skip if the num of NPU cards is less than the required",
# more detailed markers
"slow: Slow tests (may skip in quick CI)",
"benchmark: Benchmark tests",
]
[tool.typos.default]
extend-ignore-identifiers-re = [
".*_thw",
".*thw",
"ein",
".*arange",
".*MoBA",
".*temperal_downsample",
".*nothink.*",
".*NOTHINK.*",
".*nin.*",
"Ono_Anna",
]
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/.." && pwd)"
PROJECT_NAME="vllm-omni"
RUN_QUALITY=false
SKIP_CLEAN=false
CREATE_VENV=false
VENV_DIR=".venv-build"
PYTHON_BIN="python"
UV_BIN="uv"
log() {
local level="$1"
shift
printf '[%s] %s\n' "${level}" "$*"
}
abort() {
log "ERROR" "$*"
exit 1
}
usage() {
cat <<EOF
Usage: $(basename "$0") [options]
Options:
--run-quality Run pre-commit, install dev deps, and pytest before building
--skip-clean Skip removing previous build artifacts
--create-venv Build inside a fresh virtual environment (default path: .venv-build)
--venv-dir PATH Custom directory for the virtual environment (implies --create-venv)
--python PATH Python executable to use (default: python)
-h, --help Show this help message
EOF
}
while [[ $# -gt 0 ]]; do
case "$1" in
--run-quality)
RUN_QUALITY=true
;;
--skip-clean)
SKIP_CLEAN=true
;;
--create-venv)
CREATE_VENV=true
;;
--venv-dir)
CREATE_VENV=true
shift
[[ $# -gt 0 ]] || abort "--venv-dir requires a path"
VENV_DIR="$1"
;;
--python)
shift
[[ $# -gt 0 ]] || abort "--python requires a path"
PYTHON_BIN="$1"
;;
-h|--help)
usage
exit 0
;;
*)
usage
abort "Unknown option: $1"
;;
esac
shift
done
HOST_PYTHON="${PYTHON_BIN}"
log "INFO" "Switching to repository root: ${REPO_ROOT}"
cd "${REPO_ROOT}" || abort "Cannot enter repository root"
[[ -f pyproject.toml ]] || abort "pyproject.toml not found, please ensure correct script location"
ensure_uv() {
if ! command -v "${UV_BIN}" >/dev/null 2>&1; then
log "INFO" "uv not found, installing via ${HOST_PYTHON}"
"${HOST_PYTHON}" -m pip install --upgrade pip
"${HOST_PYTHON}" -m pip install uv
fi
}
ensure_uv
if [[ "${CREATE_VENV}" == "true" ]]; then
log "INFO" "Creating fresh virtual environment at ${VENV_DIR} via uv"
"${UV_BIN}" venv --python "${HOST_PYTHON}" --seed "${VENV_DIR}"
PYTHON_BIN="${VENV_DIR}/bin/python"
[[ -x "${PYTHON_BIN}" ]] || abort "Failed to locate python inside ${VENV_DIR}"
log "INFO" "Installing build module inside virtual environment"
"${UV_BIN}" pip install --python "${PYTHON_BIN}" build
else
log "INFO" "Ensuring build module is available via uv pip"
"${UV_BIN}" pip install --python "${PYTHON_BIN}" build
fi
log "INFO" "Checking build module"
if ! "${PYTHON_BIN}" -m build --version >/dev/null 2>&1; then
abort "${PYTHON_BIN} -m build is not available, install build first"
fi
run_quality_steps() {
log "INFO" "Running quality checks"
"${UV_BIN}" pip install --python "${PYTHON_BIN}" -e ".[dev]"
"${PYTHON_BIN}" -m pre_commit run --all-files
"${PYTHON_BIN}" -m pytest tests/ -v -m "not slow"
}
if [[ "${RUN_QUALITY}" == "true" ]]; then
run_quality_steps
else
log "INFO" "Quality steps available via --run-quality"
log "INFO" " - pre-commit run --all-files"
log "INFO" " - pip install -e '.[dev]'"
log "INFO" " - pytest tests/ -v -m \"not slow\""
fi
cleanup_artifacts() {
log "INFO" "Cleaning previous build artifacts"
rm -rf build dist "${PROJECT_NAME}.egg-info" "${PROJECT_NAME//-/_}.egg-info"
}
if [[ "${SKIP_CLEAN}" == "true" ]]; then
log "INFO" "Skipping cleanup as requested"
else
cleanup_artifacts
fi
log "INFO" "Building source and wheel distributions"
"${PYTHON_BIN}" -m build
log "INFO" "Build finished, artifacts:"
ls -lh dist
"""
Test suite for vLLM-Omni.
This package contains unit tests, integration tests, and benchmarks
for vLLM-Omni.
"""