Commit c1cacde6 authored by weishb

vllm-omni_0.15.0.rc1+fix1 first commit

parent 35607782
import argparse
import json
import os
import soundfile as sf
from qwen3_omni_moe_model import Qwen3OmniMoeForConditionalGenerationWithLogging
from qwen_omni_utils import process_mm_info
from tqdm import tqdm
from transformers import Qwen3OmniMoeProcessor
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking"
def load_prompts(prompts_file: str) -> list[str]:
"""Load prompts from a text file, one prompt per line."""
prompts = []
with open(prompts_file, encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
prompts.append(line)
return prompts
def run_benchmark(
model,
processor,
prompts: list[str],
output_dir: str = "benchmark_results",
speaker: str = "Ethan",
use_audio_in_video: bool = True,
):
"""
Run benchmark on a list of prompts and collect performance stats.
Args:
model: The Qwen3OmniMoe model
processor: The Qwen3OmniMoe processor
prompts: List of text prompts to process
output_dir: Directory to save results
speaker: Speaker voice for audio output
use_audio_in_video: Whether to use audio in video
Returns:
tuple: (aggregated_stats, results, audio_outputs)
- aggregated_stats: dict with aggregated performance statistics
- results: list of dicts with per-prompt results
- audio_outputs: list of audio tensors/arrays (or None if no audio)
"""
os.makedirs(output_dir, exist_ok=True)
audio_dir = os.path.join(output_dir, "audio")
os.makedirs(audio_dir, exist_ok=True)
all_stats = []
results = []
audio_outputs = []
for idx, prompt in enumerate(tqdm(prompts, desc="Processing prompts")):
conversation = [
{
"role": "user",
"content": [{"type": "text", "text": prompt}],
},
]
# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=use_audio_in_video)
inputs = processor(
text=text,
audio=audios,
images=images,
videos=videos,
return_tensors="pt",
padding=True,
use_audio_in_video=use_audio_in_video,
)
inputs = inputs.to(model.device).to(model.dtype)
# Inference: Generation of the output text and audio
text_ids, audio = model.generate(
**inputs, speaker=speaker, thinker_return_dict_in_generate=True, use_audio_in_video=use_audio_in_video
)
# Decode output text
output_text = processor.batch_decode(
text_ids.sequences[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)[0]
# Collect performance stats
perf_stats = None
if hasattr(model, "_perf_stats_last"):
perf_stats = model._perf_stats_last.copy()
perf_stats["prompt_idx"] = idx
perf_stats["prompt"] = prompt
all_stats.append(perf_stats)
# Save audio and collect audio output
audio_path = None
audio_data = None
if audio is not None:
audio_data = audio.reshape(-1).detach().cpu().numpy()
audio_path = os.path.join(audio_dir, f"output_{idx:04d}.wav")
sf.write(
audio_path,
audio_data,
samplerate=24000,
)
audio_outputs.append(audio_data)
else:
audio_outputs.append(None)
# Save result
result = {
"idx": idx,
"prompt": prompt,
"output": output_text,
"audio_path": audio_path,
"perf_stats": perf_stats,
}
results.append(result)
# Aggregate statistics
aggregated_stats = aggregate_stats(all_stats)
# Save all results
results_path = os.path.join(output_dir, "results.json")
with open(results_path, "w", encoding="utf-8") as f:
json.dump(results, f, ensure_ascii=False, indent=2)
# Save aggregated stats
stats_path = os.path.join(output_dir, "perf_stats.json")
with open(stats_path, "w", encoding="utf-8") as f:
json.dump({"aggregated": aggregated_stats, "per_prompt": all_stats}, f, ensure_ascii=False, indent=2)
# Count saved audio files
num_audio_saved = sum(1 for a in audio_outputs if a is not None)
print(f"\nSaved {num_audio_saved} audio files to {audio_dir}/")
return aggregated_stats, results, audio_outputs
def aggregate_stats(all_stats: list[dict]) -> dict:
"""Aggregate performance statistics from multiple runs."""
if not all_stats:
return {}
keys = [
"thinker_tokens",
"thinker_time_s",
"thinker_tps",
"talker_tokens",
"talker_time_s",
"talker_tps",
"code2wav_tokens",
"code2wav_time_s",
"code2wav_tps",
"total_tokens",
"total_time_s",
"total_tps",
]
aggregated = {
"num_samples": len(all_stats),
}
for key in keys:
values = [s.get(key, 0) for s in all_stats if key in s]
if values:
aggregated[f"{key}_sum"] = sum(values)
aggregated[f"{key}_avg"] = sum(values) / len(values)
aggregated[f"{key}_min"] = min(values)
aggregated[f"{key}_max"] = max(values)
# Calculate overall throughput
total_tokens = aggregated.get("total_tokens_sum", 0)
total_time = aggregated.get("total_time_s_sum", 0)
if total_time > 0:
aggregated["overall_tps"] = total_tokens / total_time
return aggregated
def print_stats(stats: dict):
"""Print performance statistics in a formatted way."""
print("\n" + "=" * 60)
print("Performance Statistics Summary")
print("=" * 60)
print(f"\nNumber of samples: {stats.get('num_samples', 0)}")
print("\n--- Thinker ---")
print(f" Total tokens: {stats.get('thinker_tokens_sum', 0):.0f}")
print(f" Total time: {stats.get('thinker_time_s_sum', 0):.2f}s")
print(f" Avg TPS: {stats.get('thinker_tps_avg', 0):.2f}")
print(f" Min TPS: {stats.get('thinker_tps_min', 0):.2f}")
print(f" Max TPS: {stats.get('thinker_tps_max', 0):.2f}")
print("\n--- Talker ---")
print(f" Total tokens: {stats.get('talker_tokens_sum', 0):.0f}")
print(f" Total time: {stats.get('talker_time_s_sum', 0):.2f}s")
print(f" Avg TPS: {stats.get('talker_tps_avg', 0):.2f}")
print(f" Min TPS: {stats.get('talker_tps_min', 0):.2f}")
print(f" Max TPS: {stats.get('talker_tps_max', 0):.2f}")
print("\n--- Code2Wav ---")
print(f" Total tokens: {stats.get('code2wav_tokens_sum', 0):.0f}")
print(f" Total time: {stats.get('code2wav_time_s_sum', 0):.2f}s")
print(f" Avg TPS: {stats.get('code2wav_tps_avg', 0):.2f}")
print(f" Min TPS: {stats.get('code2wav_tps_min', 0):.2f}")
print(f" Max TPS: {stats.get('code2wav_tps_max', 0):.2f}")
print("\n--- Overall ---")
print(f" Total tokens: {stats.get('total_tokens_sum', 0):.0f}")
print(f" Total time: {stats.get('total_time_s_sum', 0):.2f}s")
print(f" Overall TPS: {stats.get('overall_tps', 0):.2f}")
print(f" Avg TPS: {stats.get('total_tps_avg', 0):.2f}")
print(f" Min TPS: {stats.get('total_tps_min', 0):.2f}")
print(f" Max TPS: {stats.get('total_tps_max', 0):.2f}")
print("=" * 60 + "\n")
def main():
parser = argparse.ArgumentParser(description="Qwen3-Omni Benchmark Script")
parser.add_argument(
"--prompts_file",
type=str,
default="benchmark/build_dataset/top100.txt",
help="Path to the prompts file (one prompt per line)",
)
parser.add_argument(
"--output_dir", type=str, default="benchmark_results", help="Directory to save benchmark results"
)
parser.add_argument("--model_path", type=str, default=MODEL_PATH, help="Path to the model")
parser.add_argument("--speaker", type=str, default="Ethan", help="Speaker voice for audio output")
parser.add_argument("--num_prompts", type=int, default=None, help="Number of prompts to process (default: all)")
args = parser.parse_args()
# Load model and processor
print(f"Loading model from {args.model_path}...")
model = Qwen3OmniMoeForConditionalGenerationWithLogging.from_pretrained(
args.model_path,
dtype="auto",
device_map="auto",
attn_implementation="flash_attention_2",
)
processor = Qwen3OmniMoeProcessor.from_pretrained(args.model_path)
# Benchmark mode
print(f"Loading prompts from {args.prompts_file}...")
prompts = load_prompts(args.prompts_file)
if args.num_prompts:
prompts = prompts[: args.num_prompts]
print(f"Running benchmark on {len(prompts)} prompts...")
aggregated_stats, results, audio_outputs = run_benchmark(
model=model,
processor=processor,
prompts=prompts,
output_dir=args.output_dir,
speaker=args.speaker,
)
print_stats(aggregated_stats)
print(f"\nResults saved to {args.output_dir}/")
if __name__ == "__main__":
main()
#!/bin/bash
# Qwen3-Omni Benchmark Evaluation Script
# This script must be run from the vllm-omni root directory
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Navigate to the vllm-omni root directory (three levels up from the script location)
VLLM_OMNI_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
cd "$VLLM_OMNI_ROOT" || { echo "Error: Failed to navigate to vllm-omni directory"; exit 1; }
echo "Working directory: $(pwd)"
# Verify we're in the correct directory and run benchmark
if [[ ! -d "benchmarks/qwen3-omni/vllm_omni" ]]; then
echo "Error: Not in vllm-omni root directory. Please run from vllm-omni folder."
else
log_dir=benchmarks/qwen3-omni/vllm_omni/logs
outputs_dir=benchmarks/qwen3-omni/vllm_omni/outputs
end2end_script_path=examples/offline_inference/qwen3_omni/end2end.py
build_dataset_path=benchmarks/build_dataset/top100.txt
python $end2end_script_path --output-wav $outputs_dir \
--query-type text \
--txt-prompts $build_dataset_path \
--enable-stats \
--log-dir $log_dir
echo "Logs and outputs are saved in ${log_dir} and ${outputs_dir} respectively:"
echo " - omni_llm_pipeline_text run dir/base name"
echo " - omni_llm_pipeline_text.orchestrator.stats.jsonl orchestrator-stage latency stats"
echo " - omni_llm_pipeline_text.overall.stats.jsonl overall latency/TPS stats"
echo " - omni_llm_pipeline_text.stage0.log per-stage detailed logs"
echo " - omni_llm_pipeline_text.stage1.log"
echo " - omni_llm_pipeline_text.stage2.log"
echo "Key checks: overall.stats.jsonl for end-to-end latency/TPS; orchestrator.stats.jsonl for stable per-stage latency; stage*.log for errors or long tails."
echo " - outputs/ Generated txt and wav files; 100 text files and 100 wav files should be generated"
fi
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
import datetime
import locale
import os
import subprocess
import sys
# Unlike the rest of PyTorch, this file must be Python 2 compliant.
# This script outputs relevant system environment info
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
from collections import namedtuple
import regex as re
from vllm.envs import environment_variables
try:
import torch
TORCH_AVAILABLE = True
except (ImportError, NameError, AttributeError, OSError):
TORCH_AVAILABLE = False
# System Environment Information
SystemEnv = namedtuple(
"SystemEnv",
[
"torch_version",
"is_debug_build",
"cuda_compiled_version",
"gcc_version",
"clang_version",
"cmake_version",
"os",
"libc_version",
"python_version",
"python_platform",
"is_cuda_available",
"cuda_runtime_version",
"cuda_module_loading",
"nvidia_driver_version",
"nvidia_gpu_models",
"cudnn_version",
"pip_version", # 'pip' or 'pip3'
"pip_packages",
"conda_packages",
"hip_compiled_version",
"hip_runtime_version",
"miopen_runtime_version",
"caching_allocator_config",
"is_xnnpack_available",
"cpu_info",
"rocm_version", # vllm specific field
"vllm_version", # vllm specific field
"vllm_omni_version", # vllm-omni specific field
"vllm_build_flags", # vllm specific field
"gpu_topo", # vllm specific field
"env_vars",
],
)
DEFAULT_CONDA_PATTERNS = {
"torch",
"numpy",
"cudatoolkit",
"soumith",
"mkl",
"magma",
"triton",
"optree",
"nccl",
"transformers",
"zmq",
"nvidia",
"pynvml",
"flashinfer-python",
}
DEFAULT_PIP_PATTERNS = {
"torch",
"numpy",
"mypy",
"flake8",
"triton",
"optree",
"onnx",
"nccl",
"transformers",
"zmq",
"nvidia",
"pynvml",
"flashinfer-python",
}
def run(command):
"""Return (return-code, stdout, stderr)."""
shell = isinstance(command, str)
try:
p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell)
raw_output, raw_err = p.communicate()
rc = p.returncode
if get_platform() == "win32":
enc = "oem"
else:
enc = locale.getpreferredencoding()
output = raw_output.decode(enc)
if command == "nvidia-smi topo -m":
# don't remove the leading whitespace of `nvidia-smi topo -m`
# because they are meaningful
output = output.rstrip()
else:
output = output.strip()
err = raw_err.decode(enc)
return rc, output, err.strip()
except FileNotFoundError:
cmd_str = command if isinstance(command, str) else command[0]
return 127, "", f"Command not found: {cmd_str}"
def run_and_read_all(run_lambda, command):
"""Run command using run_lambda; reads and returns entire output if rc is 0."""
rc, out, _ = run_lambda(command)
if rc != 0:
return None
return out
def run_and_parse_first_match(run_lambda, command, regex):
"""Run command using run_lambda, returns the first regex match if it exists."""
rc, out, _ = run_lambda(command)
if rc != 0:
return None
match = re.search(regex, out)
if match is None:
return None
return match.group(1)
def run_and_return_first_line(run_lambda, command):
"""Run command using run_lambda and returns first line if output is not empty."""
rc, out, _ = run_lambda(command)
if rc != 0:
return None
return out.split("\n")[0]
def get_conda_packages(run_lambda, patterns=None):
if patterns is None:
patterns = DEFAULT_CONDA_PATTERNS
conda = os.environ.get("CONDA_EXE", "conda")
out = run_and_read_all(run_lambda, [conda, "list"])
if out is None:
return out
return "\n".join(
line for line in out.splitlines() if not line.startswith("#") and any(name in line for name in patterns)
)
def get_gcc_version(run_lambda):
return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)")
def get_clang_version(run_lambda):
return run_and_parse_first_match(run_lambda, "clang --version", r"clang version (.*)")
def get_cmake_version(run_lambda):
return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)")
def get_nvidia_driver_version(run_lambda):
if get_platform() == "darwin":
cmd = "kextstat | grep -i cuda"
return run_and_parse_first_match(run_lambda, cmd, r"com[.]nvidia[.]CUDA [(](.*?)[)]")
smi = get_nvidia_smi()
return run_and_parse_first_match(run_lambda, smi, r"Driver Version: (.*?) ")
def get_gpu_info(run_lambda):
if get_platform() == "darwin" or (
TORCH_AVAILABLE and hasattr(torch.version, "hip") and torch.version.hip is not None
):
if TORCH_AVAILABLE and torch.cuda.is_available():
if torch.version.hip is not None:
prop = torch.cuda.get_device_properties(0)
if hasattr(prop, "gcnArchName"):
gcnArch = " ({})".format(prop.gcnArchName)
else:
gcnArch = "NoGCNArchNameOnOldPyTorch"
else:
gcnArch = ""
return torch.cuda.get_device_name(None) + gcnArch
return None
smi = get_nvidia_smi()
uuid_regex = re.compile(r" \(UUID: .+?\)")
rc, out, _ = run_lambda(smi + " -L")
if rc != 0:
return None
# Anonymize GPUs by removing their UUID
return re.sub(uuid_regex, "", out)
def get_running_cuda_version(run_lambda):
return run_and_parse_first_match(run_lambda, "nvcc --version", r"release .+ V(.*)")
def get_cudnn_version(run_lambda):
"""Return a list of libcudnn.so; it's hard to tell which one is being used."""
if get_platform() == "win32":
system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
cuda_path = os.environ.get("CUDA_PATH", "%CUDA_PATH%")
where_cmd = os.path.join(system_root, "System32", "where")
cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
elif get_platform() == "darwin":
# CUDA libraries and drivers can be found in /usr/local/cuda/. See
# https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
# https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
# Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*"
else:
cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'
rc, out, _ = run_lambda(cudnn_cmd)
# find will return 1 if there are permission errors or if not found
if len(out) == 0 or (rc != 1 and rc != 0):
l = os.environ.get("CUDNN_LIBRARY")
if l is not None and os.path.isfile(l):
return os.path.realpath(l)
return None
files_set = set()
for fn in out.split("\n"):
fn = os.path.realpath(fn) # eliminate symbolic links
if os.path.isfile(fn):
files_set.add(fn)
if not files_set:
return None
# Alphabetize the result because the order is non-deterministic otherwise
files = sorted(files_set)
if len(files) == 1:
return files[0]
result = "\n".join(files)
return "Probably one of the following:\n{}".format(result)
def get_nvidia_smi():
# Note: nvidia-smi is currently available only on Windows and Linux
smi = "nvidia-smi"
if get_platform() == "win32":
system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
program_files_root = os.environ.get("PROGRAMFILES", "C:\\Program Files")
legacy_path = os.path.join(program_files_root, "NVIDIA Corporation", "NVSMI", smi)
new_path = os.path.join(system_root, "System32", smi)
smis = [new_path, legacy_path]
for candidate_smi in smis:
if os.path.exists(candidate_smi):
smi = '"{}"'.format(candidate_smi)
break
return smi
def get_rocm_version(run_lambda):
"""Returns the ROCm version if available, otherwise 'N/A'."""
return run_and_parse_first_match(run_lambda, "hipcc --version", r"HIP version: (\S+)")
def get_vllm_version():
from vllm import __version__, __version_tuple__
if __version__ == "dev":
return "N/A (dev)"
version_str = __version_tuple__[-1]
if isinstance(version_str, str) and version_str.startswith("g"):
# it's a dev build
if "." in version_str:
# it's a dev build containing local changes
git_sha = version_str.split(".")[0][1:]
date = version_str.split(".")[-1][1:]
return f"{__version__} (git sha: {git_sha}, date: {date})"
else:
# it's a dev build without local changes
git_sha = version_str[1:] # type: ignore
return f"{__version__} (git sha: {git_sha})"
return __version__
def get_vllm_omni_version(run_lambda):
try:
import vllm_omni
from vllm_omni import __version__, __version_tuple__
version_str = __version_tuple__[-1]
if isinstance(version_str, str) and version_str.startswith("g"):
if "." in version_str:
git_sha = version_str.split(".")[0][1:]
date = version_str.split(".")[-1][1:]
return f"{__version__} (git sha: {git_sha}, date: {date})"
else:
git_sha = version_str[1:]
return f"{__version__} (git sha: {git_sha})"
package_dir = os.path.dirname(os.path.abspath(vllm_omni.__file__))
git_sha = run_and_read_all(run_lambda, f"git -C {package_dir} rev-parse --short HEAD")
if git_sha:
return f"{__version__} (git sha: {git_sha})"
return __version__
except ImportError:
return "N/A (vllm_omni not installed)"
def summarize_vllm_build_flags():
# This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
return "CUDA Archs: {}; ROCm: {}".format(
os.environ.get("TORCH_CUDA_ARCH_LIST", "Not Set"),
"Enabled" if os.environ.get("ROCM_HOME") else "Disabled",
)
def get_gpu_topo(run_lambda):
output = None
if get_platform() == "linux":
output = run_and_read_all(run_lambda, "nvidia-smi topo -m")
if output is None:
output = run_and_read_all(run_lambda, "rocm-smi --showtopo")
return output
def get_cpu_info(run_lambda):
rc, out, err = 0, "", ""
if get_platform() == "linux":
rc, out, err = run_lambda("lscpu")
elif get_platform() == "win32":
rc, out, err = run_lambda(
"wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE"
)
elif get_platform() == "darwin":
rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
cpu_info = "None"
if rc == 0:
cpu_info = out
else:
cpu_info = err
return cpu_info
def get_platform():
if sys.platform.startswith("linux"):
return "linux"
elif sys.platform.startswith("win32"):
return "win32"
elif sys.platform.startswith("cygwin"):
return "cygwin"
elif sys.platform.startswith("darwin"):
return "darwin"
else:
return sys.platform
def get_mac_version(run_lambda):
return run_and_parse_first_match(run_lambda, "sw_vers -productVersion", r"(.*)")
def get_windows_version(run_lambda):
system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
wmic_cmd = os.path.join(system_root, "System32", "Wbem", "wmic")
findstr_cmd = os.path.join(system_root, "System32", "findstr")
return run_and_read_all(run_lambda, "{} os get Caption | {} /v Caption".format(wmic_cmd, findstr_cmd))
def get_lsb_version(run_lambda):
return run_and_parse_first_match(run_lambda, "lsb_release -a", r"Description:\t(.*)")
def check_release_file(run_lambda):
return run_and_parse_first_match(run_lambda, "cat /etc/*-release", r'PRETTY_NAME="(.*)"')
def get_os(run_lambda):
from platform import machine
platform = get_platform()
if platform == "win32" or platform == "cygwin":
return get_windows_version(run_lambda)
if platform == "darwin":
version = get_mac_version(run_lambda)
if version is None:
return None
return "macOS {} ({})".format(version, machine())
if platform == "linux":
# Ubuntu/Debian based
desc = get_lsb_version(run_lambda)
if desc is not None:
return "{} ({})".format(desc, machine())
# Try reading /etc/*-release
desc = check_release_file(run_lambda)
if desc is not None:
return "{} ({})".format(desc, machine())
return "{} ({})".format(platform, machine())
# Unknown platform
return platform
def get_python_platform():
import platform
return platform.platform()
def get_libc_version():
import platform
if get_platform() != "linux":
return "N/A"
return "-".join(platform.libc_ver())
def is_uv_venv():
if os.environ.get("UV"):
return True
pyvenv_cfg_path = os.path.join(sys.prefix, "pyvenv.cfg")
if os.path.exists(pyvenv_cfg_path):
with open(pyvenv_cfg_path, "r") as f:
return any(line.startswith("uv = ") for line in f)
return False
def get_pip_packages(run_lambda, patterns=None):
"""Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
if patterns is None:
patterns = DEFAULT_PIP_PATTERNS
def run_with_pip():
try:
import importlib.util
pip_spec = importlib.util.find_spec("pip")
pip_available = pip_spec is not None
except ImportError:
pip_available = False
if pip_available:
cmd = [sys.executable, "-mpip", "list", "--format=freeze"]
elif is_uv_venv():
print("uv is set")
cmd = ["uv", "pip", "list", "--format=freeze"]
else:
raise RuntimeError("Could not collect pip list output (pip or uv module not available)")
out = run_and_read_all(run_lambda, cmd)
return "\n".join(line for line in out.splitlines() if any(name in line for name in patterns))
pip_version = "pip3" if sys.version[0] == "3" else "pip"
out = run_with_pip()
return pip_version, out
def get_cachingallocator_config():
ca_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
return ca_config
def get_cuda_module_loading_config():
if TORCH_AVAILABLE and torch.cuda.is_available():
torch.cuda.init()
config = os.environ.get("CUDA_MODULE_LOADING", "")
return config
else:
return "N/A"
def is_xnnpack_available():
if TORCH_AVAILABLE:
import torch.backends.xnnpack
return str(torch.backends.xnnpack.enabled) # type: ignore[attr-defined]
else:
return "N/A"
def get_env_vars():
env_vars = ""
secret_terms = ("secret", "token", "api", "access", "password")
report_prefix = (
"TORCH",
"NCCL",
"PYTORCH",
"CUDA",
"CUBLAS",
"CUDNN",
"OMP_",
"MKL_",
"NVIDIA",
)
for k, v in os.environ.items():
if any(term in k.lower() for term in secret_terms):
continue
if k in environment_variables:
env_vars = env_vars + "{}={}".format(k, v) + "\n"
if k.startswith(report_prefix):
env_vars = env_vars + "{}={}".format(k, v) + "\n"
return env_vars
def get_env_info():
run_lambda = run
pip_version, pip_list_output = get_pip_packages(run_lambda)
if TORCH_AVAILABLE:
version_str = torch.__version__
debug_mode_str = str(torch.version.debug)
cuda_available_str = str(torch.cuda.is_available())
cuda_version_str = torch.version.cuda
if not hasattr(torch.version, "hip") or torch.version.hip is None: # cuda version
hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"
else: # HIP version
def get_version_or_na(cfg, prefix):
_lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s]
return _lst[0] if _lst else "N/A"
cfg = torch._C._show_config().split("\n")
hip_runtime_version = get_version_or_na(cfg, "HIP Runtime")
miopen_runtime_version = get_version_or_na(cfg, "MIOpen")
cuda_version_str = "N/A"
hip_compiled_version = torch.version.hip
else:
version_str = debug_mode_str = cuda_available_str = cuda_version_str = "N/A"
hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"
sys_version = sys.version.replace("\n", " ")
conda_packages = get_conda_packages(run_lambda)
rocm_version = get_rocm_version(run_lambda)
vllm_version = get_vllm_version()
vllm_omni_version = get_vllm_omni_version(run_lambda)
vllm_build_flags = summarize_vllm_build_flags()
gpu_topo = get_gpu_topo(run_lambda)
return SystemEnv(
torch_version=version_str,
is_debug_build=debug_mode_str,
python_version="{} ({}-bit runtime)".format(sys_version, sys.maxsize.bit_length() + 1),
python_platform=get_python_platform(),
is_cuda_available=cuda_available_str,
cuda_compiled_version=cuda_version_str,
cuda_runtime_version=get_running_cuda_version(run_lambda),
cuda_module_loading=get_cuda_module_loading_config(),
nvidia_gpu_models=get_gpu_info(run_lambda),
nvidia_driver_version=get_nvidia_driver_version(run_lambda),
cudnn_version=get_cudnn_version(run_lambda),
hip_compiled_version=hip_compiled_version,
hip_runtime_version=hip_runtime_version,
miopen_runtime_version=miopen_runtime_version,
pip_version=pip_version,
pip_packages=pip_list_output,
conda_packages=conda_packages,
os=get_os(run_lambda),
libc_version=get_libc_version(),
gcc_version=get_gcc_version(run_lambda),
clang_version=get_clang_version(run_lambda),
cmake_version=get_cmake_version(run_lambda),
caching_allocator_config=get_cachingallocator_config(),
is_xnnpack_available=is_xnnpack_available(),
cpu_info=get_cpu_info(run_lambda),
rocm_version=rocm_version,
vllm_version=vllm_version,
vllm_omni_version=vllm_omni_version,
vllm_build_flags=vllm_build_flags,
gpu_topo=gpu_topo,
env_vars=get_env_vars(),
)
env_info_fmt = """
==============================
System Info
==============================
OS : {os}
GCC version : {gcc_version}
Clang version : {clang_version}
CMake version : {cmake_version}
Libc version : {libc_version}
==============================
PyTorch Info
==============================
PyTorch version : {torch_version}
Is debug build : {is_debug_build}
CUDA used to build PyTorch : {cuda_compiled_version}
ROCM used to build PyTorch : {hip_compiled_version}
==============================
Python Environment
==============================
Python version : {python_version}
Python platform : {python_platform}
==============================
CUDA / GPU Info
==============================
Is CUDA available : {is_cuda_available}
CUDA runtime version : {cuda_runtime_version}
CUDA_MODULE_LOADING set to : {cuda_module_loading}
GPU models and configuration : {nvidia_gpu_models}
Nvidia driver version : {nvidia_driver_version}
cuDNN version : {cudnn_version}
HIP runtime version : {hip_runtime_version}
MIOpen runtime version : {miopen_runtime_version}
Is XNNPACK available : {is_xnnpack_available}
==============================
CPU Info
==============================
{cpu_info}
==============================
Versions of relevant libraries
==============================
{pip_packages}
{conda_packages}
""".strip()
# both the above code and the following code use `strip()` to
# remove leading/trailing whitespaces, so we need to add a newline
# in between to separate the two sections
env_info_fmt += "\n\n"
env_info_fmt += """
==============================
vLLM Info
==============================
ROCM Version : {rocm_version}
vLLM Version : {vllm_version}
vLLM-Omni Version : {vllm_omni_version}
vLLM Build Flags:
{vllm_build_flags}
GPU Topology:
{gpu_topo}
==============================
Environment Variables
==============================
{env_vars}
""".strip()
def pretty_str(envinfo):
def replace_nones(dct, replacement="Could not collect"):
for key in dct.keys():
if dct[key] is not None:
continue
dct[key] = replacement
return dct
def replace_bools(dct, true="Yes", false="No"):
for key in dct.keys():
if dct[key] is True:
dct[key] = true
elif dct[key] is False:
dct[key] = false
return dct
def prepend(text, tag="[prepend]"):
lines = text.split("\n")
updated_lines = [tag + line for line in lines]
return "\n".join(updated_lines)
def replace_if_empty(text, replacement="No relevant packages"):
if text is not None and len(text) == 0:
return replacement
return text
def maybe_start_on_next_line(string):
# If `string` is multiline, prepend a \n to it.
if string is not None and len(string.split("\n")) > 1:
return "\n{}\n".format(string)
return string
mutable_dict = envinfo._asdict()
# If nvidia_gpu_models is multiline, start on the next line
mutable_dict["nvidia_gpu_models"] = maybe_start_on_next_line(envinfo.nvidia_gpu_models)
# If the machine doesn't have CUDA, report some fields as 'No CUDA'
dynamic_cuda_fields = [
"cuda_runtime_version",
"nvidia_gpu_models",
"nvidia_driver_version",
]
all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"]
all_dynamic_cuda_fields_missing = all(mutable_dict[field] is None for field in dynamic_cuda_fields)
if TORCH_AVAILABLE and not torch.cuda.is_available() and all_dynamic_cuda_fields_missing:
for field in all_cuda_fields:
mutable_dict[field] = "No CUDA"
if envinfo.cuda_compiled_version is None:
mutable_dict["cuda_compiled_version"] = "None"
# Replace True with Yes, False with No
mutable_dict = replace_bools(mutable_dict)
# Replace all None objects with 'Could not collect'
mutable_dict = replace_nones(mutable_dict)
# If either of these are '', replace with 'No relevant packages'
mutable_dict["pip_packages"] = replace_if_empty(mutable_dict["pip_packages"])
mutable_dict["conda_packages"] = replace_if_empty(mutable_dict["conda_packages"])
# Tag conda and pip packages with a prefix
# If they were previously None, they'll show up as ie '[conda] Could not collect'
if mutable_dict["pip_packages"]:
mutable_dict["pip_packages"] = prepend(mutable_dict["pip_packages"], "[{}] ".format(envinfo.pip_version))
if mutable_dict["conda_packages"]:
mutable_dict["conda_packages"] = prepend(mutable_dict["conda_packages"], "[conda] ")
mutable_dict["cpu_info"] = envinfo.cpu_info
return env_info_fmt.format(**mutable_dict)
def get_pretty_env_info():
return pretty_str(get_env_info())
def main():
print("Collecting environment information...")
output = get_pretty_env_info()
print(output)
if TORCH_AVAILABLE and hasattr(torch, "utils") and hasattr(torch.utils, "_crash_handler"):
minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
if sys.platform == "linux" and os.path.exists(minidump_dir):
dumps = [os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)]
latest = max(dumps, key=os.path.getctime)
ctime = os.path.getctime(latest)
creation_time = datetime.datetime.fromtimestamp(ctime).strftime("%Y-%m-%d %H:%M:%S")
msg = (
"\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time)
+ "if this is related to your bug please include it when you file a report ***"
)
print(msg, file=sys.stderr)
if __name__ == "__main__":
main()
ARG VLLM_BASE_IMAGE=vllm/vllm-openai
ARG VLLM_BASE_TAG=v0.15.0
FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG}
ARG APP_DIR=/workspace/vllm-omni
WORKDIR ${APP_DIR}
COPY . .
# Install system dependencies
RUN apt-get update && \
apt-get install -y ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install vllm-omni into the same uv-managed Python environment used by the base image.
RUN uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]"
RUN ln -sf /usr/bin/python3 /usr/bin/python
ENTRYPOINT []
ARG VLLM_ASCEND_IMAGE=quay.nju.edu.cn/ascend/vllm-ascend
ARG VLLM_ASCEND_TAG=v0.11.0rc2
FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}
ARG APP_DIR=/vllm-workspace/vllm-omni
WORKDIR ${APP_DIR}
COPY . .
# Install vllm-omni with dev dependencies
RUN pip install --no-cache-dir -e ".[dev]"
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENTRYPOINT []
ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.15.0
FROM ${BASE_IMAGE} AS final
ARG COMMON_WORKDIR=/app
WORKDIR ${COMMON_WORKDIR}
# Step 1: Setup - Install system dependencies
RUN apt-get update && \
apt-get install -y ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni
# Step 2: Copy vllm-omni code and install it into the base image's uv-managed Python environment
COPY . ${COMMON_WORKDIR}/vllm-omni
RUN cd ${COMMON_WORKDIR}/vllm-omni && uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]"
# onnxruntime-rocm and sox are dependencies of Qwen3-TTS; the system-installed
# onnxruntime must be uninstalled before installing onnxruntime-rocm.
RUN uv pip uninstall onnxruntime --system && uv pip install --no-cache-dir onnxruntime-rocm sox --system
RUN ln -sf /usr/bin/python3 /usr/bin/python
CMD ["/bin/bash"]
ENTRYPOINT []
# Set the entrypoint for the official vllm-openai images
FROM final AS vllm-openai
ENTRYPOINT ["vllm", "serve", "--omni"]
nav:
- Home: README.md
- User Guide:
- Getting Started:
- getting_started/quickstart.md
- getting_started/installation/*
- Serving:
- OpenAI-Compatible API:
- Image Generation: serving/image_generation_api.md
- Image Edit: serving/image_edit_api.md
- Examples:
- examples/README.md
- Offline Inference:
- Image-To-Image: user_guide/examples/offline_inference/image_to_image.md
- Image-To-Video: user_guide/examples/offline_inference/image_to_video.md
- Qwen2.5-Omni: user_guide/examples/offline_inference/qwen2_5_omni.md
- Qwen3-Omni: user_guide/examples/offline_inference/qwen3_omni.md
- Qwen3-TTS Offline Inference: user_guide/examples/offline_inference/qwen3_tts.md
- Text-To-Image: user_guide/examples/offline_inference/text_to_image.md
- Text-To-Video: user_guide/examples/offline_inference/text_to_video.md
- Online Serving:
- Image-To-Image: user_guide/examples/online_serving/image_to_image.md
- Qwen2.5-Omni: user_guide/examples/online_serving/qwen2_5_omni.md
- Qwen3-Omni: user_guide/examples/online_serving/qwen3_omni.md
- Text-To-Image: user_guide/examples/online_serving/text_to_image.md
- General:
- usage/*
- Configuration:
- configuration/README.md
- configuration/*
- Models:
- models/supported_models.md
- Features:
- Sleep Mode: features/sleep_mode.md
- Diffusion Features:
- Overview: user_guide/diffusion_acceleration.md
- TeaCache: user_guide/diffusion/teacache.md
- Cache-DiT: user_guide/diffusion/cache_dit_acceleration.md
- Parallelism Acceleration: user_guide/diffusion/parallelism_acceleration.md
- CPU Offloading: user_guide/diffusion/cpu_offload_diffusion.md
- Developer Guide:
- General:
- contributing/README.md
- glob: contributing/*
flatten_single_child_sections: true
- Model Implementation:
- contributing/model/README.md
- contributing/model/adding_omni_model.md
- contributing/model/adding_diffusion_model.md
- CI: contributing/ci
- Design Documents:
- design/index.md
- design/architecture_overview.md
- Feature Design:
- design/feature/disaggregated_inference.md
- design/feature/ray_based_execution.md
- Module Design:
- design/module/ar_module.md
- design/module/dit_module.md
- design/module/entrypoint_module.md
- Docs Guide: contributing/DOCS_GUIDE.md
- API Reference:
- api/README.md
- api/vllm_omni
- CLI Reference: cli
- Community:
- community/*
- Slack: https://slack.vllm.ai
- Blog: https://blog.vllm.ai
- Forum: https://discuss.vllm.ai
---
hide:
- navigation
- toc
---
# Welcome to vLLM-Omni
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="./source/logos/vllm-omni-logo.png">
<img alt="vllm-omni" src="./source/logos/vllm-omni-logo.png" width=55%>
</picture>
</p>
<h3 align="center">
Easy, fast, and cheap omni-modality model serving for everyone
</h3>
<p style="text-align:center">
<script async defer src="https://buttons.github.io/buttons.js"></script>
<a class="github-button" href="https://github.com/vllm-project/vllm-omni" data-show-count="true" data-size="large" aria-label="Star">Star</a>
<a class="github-button" href="https://github.com/vllm-project/vllm-omni/subscription" data-show-count="true" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
<a class="github-button" href="https://github.com/vllm-project/vllm-omni/fork" data-show-count="true" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
</p>
## About
[vLLM](https://github.com/vllm-project/vllm) was originally designed to support large language models for text-based autoregressive generation tasks. vLLM-Omni is a framework that extends it to support omni-modality model inference and serving:
- **Omni-modality**: Text, image, video, and audio data processing
- **Non-autoregressive Architectures**: Extends vLLM's AR support to Diffusion Transformers (DiT) and other parallel generation models
- **Heterogeneous outputs**: From traditional text generation to multimodal outputs
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="./source/architecture/omni-modality-model-architecture.png">
<img alt="vllm-omni-arch" src="./source/architecture/omni-modality-model-architecture.png" width=55%>
</picture>
</p>
vLLM-Omni is fast with:
- State-of-the-art AR support, leveraging efficient KV cache management from vLLM
- Pipelined, overlapped stage execution for high throughput
- Full disaggregation based on OmniConnector, with dynamic resource allocation across stages
vLLM-Omni is flexible and easy to use with:
- Heterogeneous pipeline abstraction to manage complex model workflows
- Seamless integration with popular Hugging Face models
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
vLLM-Omni seamlessly supports most popular open-source models on Hugging Face, including:
- Omni-modality models (e.g. Qwen2.5-Omni, Qwen3-Omni)
- Multi-modality generation models (e.g. Qwen-Image)
For more information, check out the following:
- [vllm-omni architecture design and recent roadmaps](https://docs.google.com/presentation/d/1qv4qMW1rKAqDREMXiUDLIgqqHQe7TDPj/edit?usp=sharing&ouid=110473603432222024453&rtpof=true&sd=true)
- [vllm-omni announcement blogpost](https://blog.vllm.ai/2025/11/30/vllm-omni.html)
# Summary
## Entry Points
Main entry points for vLLM-Omni inference and serving.
- [vllm_omni.entrypoints.async_omni.AsyncOmni][]
- [vllm_omni.entrypoints.async_omni_diffusion.AsyncOmniDiffusion][]
- [vllm_omni.entrypoints.async_omni_llm.AsyncOmniLLM][]
- [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalContentParser][]
- [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalItemTracker][]
- [vllm_omni.entrypoints.chat_utils.parse_chat_messages_futures][]
- [vllm_omni.entrypoints.cli.serve.OmniServeCommand][]
- [vllm_omni.entrypoints.client_request_state.ClientRequestState][]
- [vllm_omni.entrypoints.log_utils.OrchestratorMetrics][]
- [vllm_omni.entrypoints.log_utils.StageRequestMetrics][]
- [vllm_omni.entrypoints.log_utils.StageStats][]
- [vllm_omni.entrypoints.omni.Omni][]
- [vllm_omni.entrypoints.omni.OmniBase][]
- [vllm_omni.entrypoints.omni_diffusion.OmniDiffusion][]
- [vllm_omni.entrypoints.omni_llm.OmniLLM][]
- [vllm_omni.entrypoints.omni_stage.OmniStage][]
- [vllm_omni.entrypoints.stage_utils.OmniStageTaskType][]
## Inputs
Input data structures for multi-modal inputs.
- [vllm_omni.inputs.data.OmniEmbedsPrompt][]
- [vllm_omni.inputs.data.OmniTokenInputs][]
- [vllm_omni.inputs.data.OmniTokensPrompt][]
- [vllm_omni.inputs.parse.parse_singleton_prompt_omni][]
- [vllm_omni.inputs.preprocess.OmniInputPreprocessor][]
## Engine
Engine classes for offline and online inference.
- [vllm_omni.diffusion.diffusion_engine.DiffusionEngine][]
- [vllm_omni.engine.AdditionalInformationEntry][]
- [vllm_omni.engine.AdditionalInformationPayload][]
- [vllm_omni.engine.OmniEngineCoreOutput][]
- [vllm_omni.engine.OmniEngineCoreOutputs][]
- [vllm_omni.engine.OmniEngineCoreRequest][]
- [vllm_omni.engine.PromptEmbedsPayload][]
- [vllm_omni.engine.arg_utils.AsyncOmniEngineArgs][]
- [vllm_omni.engine.arg_utils.OmniEngineArgs][]
- [vllm_omni.engine.input_processor.OmniInputProcessor][]
- [vllm_omni.engine.output_processor.MultimodalOutputProcessor][]
- [vllm_omni.engine.output_processor.OmniRequestState][]
## Core
Core scheduling and caching components.
- [vllm_omni.core.sched.omni_ar_scheduler.KVCacheTransferData][]
- [vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler][]
- [vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler][]
- [vllm_omni.core.sched.output.OmniCachedRequestData][]
- [vllm_omni.core.sched.output.OmniNewRequestData][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.DistributedGroupResidualVectorQuantization][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.DistributedResidualVectorQuantization][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.EuclideanCodebook][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.VectorQuantization][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.preprocess][]
## Configuration
Configuration classes.
- [vllm_omni.config.model.OmniModelConfig][]
- [vllm_omni.diffusion.cache.teacache.config.TeaCacheConfig][]
- [vllm_omni.distributed.omni_connectors.utils.config.ConnectorSpec][]
- [vllm_omni.distributed.omni_connectors.utils.config.OmniTransferConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSSpeakerEncoderConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSTalkerCodePredictorConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSTalkerConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2.Qwen3TTSTokenizerV2Config][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2.Qwen3TTSTokenizerV2DecoderConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1Config][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderBigVGANConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderDiTConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1EncoderConfig][]
## Workers
Worker classes and model runners for distributed inference.
- [vllm_omni.diffusion.worker.gpu_diffusion_model_runner.GPUDiffusionModelRunner][]
- [vllm_omni.diffusion.worker.gpu_diffusion_worker.GPUDiffusionWorker][]
- [vllm_omni.diffusion.worker.gpu_diffusion_worker.WorkerProc][]
- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorker][]
- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorkerProc][]
- [vllm_omni.worker.gpu_ar_model_runner.ExecuteModelState][]
- [vllm_omni.worker.gpu_ar_model_runner.GPUARModelRunner][]
- [vllm_omni.worker.gpu_ar_worker.GPUARWorker][]
- [vllm_omni.worker.gpu_generation_model_runner.GPUGenerationModelRunner][]
- [vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker][]
- [vllm_omni.worker.gpu_model_runner.OmniGPUModelRunner][]
- [vllm_omni.worker.npu.npu_ar_model_runner.ExecuteModelState][]
- [vllm_omni.worker.npu.npu_ar_model_runner.NPUARModelRunner][]
- [vllm_omni.worker.npu.npu_ar_worker.NPUARWorker][]
- [vllm_omni.worker.npu.npu_generation_model_runner.NPUGenerationModelRunner][]
- [vllm_omni.worker.npu.npu_generation_worker.NPUGenerationWorker][]
- [vllm_omni.worker.npu.npu_model_runner.OmniNPUModelRunner][]
# vLLM-Omni CLI Guide
The vLLM-Omni CLI inherits from the vLLM CLI and adds some arguments.
## serve
Starts the vLLM-Omni OpenAI-compatible API server.
Start with a model:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni
```
Specify the port:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091
```
If you have a custom stage configs file, launch the server with the command below:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --stage-configs-path /path/to/stage_configs_file
```
## bench
Run benchmark tests for online serving throughput.
Available Commands:
```bash
vllm bench serve --omni \
--model Qwen/Qwen2.5-Omni-7B \
--host server-host \
--port server-port \
--random-input-len 32 \
--random-output-len 4 \
--num-prompts 5
```
See [vllm bench serve](./bench/serve.md) for the full reference of all available arguments.
# vLLM-Omni Benchmark CLI Guide
The `vllm bench` command launches the vLLM-Omni benchmark to evaluate the performance of multimodal models.
## Notes
Currently only the `openai-chat-omni` backend is supported.
## Basic Parameter Description
You can use `vllm bench serve --omni --help=all` to get descriptions of all parameters. The commonly used parameters are described below:
- `--omni`
Enable Omni (multimodal) mode, supporting multimodal inputs and outputs such as images, videos, and audio.
- `--backend`
Specify the backend adapter as `openai-chat-omni`, which uses the OpenAI Chat-compatible API as the protocol. Currently only `openai-chat-omni` is supported.
- `--model`
The model identifier to load; it must be one of the models supported by vLLM-Omni.
- `--endpoint`
The API endpoint exposed externally, to which clients send their requests.
- `--dataset-name`
The name of the dataset to use; `random-mm` generates random multimodal inputs (images, videos, audio).
- `--num-prompts`
The total number of requests to send, an integer.
- `--max-concurrency`
Maximum number of concurrent requests. This can be used to help simulate an environment where a higher-level component is enforcing a maximum number of concurrent requests. While the `--request-rate` argument controls the rate at which requests are initiated, this argument controls how many are actually allowed to execute at a time. When the two are used in combination, the actual request rate may be lower than specified with `--request-rate` if the server is not processing requests fast enough to keep up.
- `--request-rate`
Number of requests per second. If this is `inf`, all requests are sent at time 0. Otherwise, a Poisson process or gamma distribution is used to synthesize the request arrival times.
- `--ignore-eos`
Set the `ignore_eos` flag when sending the benchmark request.
- `--metric-percentiles`
Comma-separated list of percentiles for the selected metrics. To report the 25th, 50th, and 75th percentiles, use "25,50,75". The default value is "99". Use `--percentile-metrics` to select metrics.
- `--percentile-metrics`
Comma-separated list of metrics to report percentiles for. Allowed metric names are "ttft", "tpot", "itl", "e2el", "audio_ttfp", and "audio_rtf".
- `--save-result`
Save benchmark results to a JSON file.
- `--save-detailed`
When saving the results, include per-request information such as response, error, ttfs, tpots, etc.
- `--result-dir`
Directory in which to save benchmark JSON results. If not specified, results are saved in the current directory.
- `--result-filename`
Filename for the benchmark JSON results. If not specified, results are saved in `{label}-{args.request_rate}qps-{base_model_id}-{current_dt}.json`.
- `--random-prefix-len`
Number of fixed prefix tokens before the random context in a request. The total input length is the sum of `random-prefix-len` and a random context length sampled from [input_len * (1 - range_ratio), input_len * (1 + range_ratio)]; see the sketch after this list. Only the random and random-mm modes support this parameter.
- `--random-input-len`
Number of input tokens per request. Only the random and random-mm modes support this parameter.
- `--random-output-len`
Number of output tokens per request. Only the random and random-mm modes support this parameter.
- `--random-range-ratio`
Range ratio for sampling input/output length,
used only for random sampling. Must be in the range [0, 1) to define
a symmetric sampling range
[length * (1 - range_ratio), length * (1 + range_ratio)].
Only the random and random-mm modes support this parameter.
- `--random-mm-base-items-per-request`
Base number of multimodal items per request for random-mm.
Actual per-request count is sampled around this base using
--random-mm-num-mm-items-range-ratio.
Only the random-mm mode supports this parameter.
- `--random-mm-limit-mm-per-prompt`
Per-modality hard caps for items attached per request, e.g.
'{"image": 3, "video": 1, "audio": 1}'. The sampled per-request item
count is clamped to the sum of these limits. When a modality
reaches its cap, its buckets are excluded and probabilities are
renormalized.
Only the random-mm mode supports this parameter.
- `--random-mm-num-mm-items-range-ratio`
Range ratio r in [0, 1] for sampling items per request.
We sample uniformly from the closed integer range
[floor(n*(1-r)), ceil(n*(1+r))]
where n is the base items per request.
r=0 keeps it fixed; r=1 allows 0 items. The maximum is clamped
to the sum of per-modality limits from
--random-mm-limit-mm-per-prompt.
An error is raised if the computed min exceeds the max.
Only the random-mm mode supports this parameter.
- `--random-mm-bucket-config`
The bucket config is a dictionary mapping a multimodal item sampling configuration to a probability. Three modalities are currently allowed: audio, images, and videos. A bucket key is a tuple of (height, width, num_frames); the value is the probability of sampling that specific item. Example:
--random-mm-bucket-config "{(256, 256, 1): 0.5, (720, 1280, 16): 0.4, (0, 1, 5): 0.10}"
First item: images with resolution 256x256, with probability 0.5.
Second item: videos with resolution 720x1280 and 16 frames, with probability 0.4.
Third item: audios with 1 s duration and 5 channels, with probability 0.1.
Note: if the probabilities do not sum to 1, they are normalized. Only the random-mm mode supports this parameter.
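To make the length arguments concrete, here is a minimal sketch of how `--random-prefix-len`, `--random-input-len`, and `--random-range-ratio` combine into a total input length. It is illustrative only and assumes uniform integer sampling; the benchmark's actual sampler may differ in rounding and distribution.

```python
import random


def sample_total_input_len(input_len: int, range_ratio: float, prefix_len: int) -> int:
    """Illustrative only: draw a context length from the symmetric range
    [input_len * (1 - r), input_len * (1 + r)] and add the fixed prefix."""
    low = int(input_len * (1 - range_ratio))
    high = int(input_len * (1 + range_ratio))
    return prefix_len + random.randint(low, high)


# e.g. --random-prefix-len 5 --random-input-len 10 --random-range-ratio 0.0
# always yields 5 + 10 = 15 total input tokens per request.
print(sample_total_input_len(input_len=10, range_ratio=0.0, prefix_len=5))
```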
## Usage Examples
### Online Benchmark
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
First start serving your model:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni
```
Then run the benchmark with the ShareGPT dataset:
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench serve \
--omni \
--port 43845 \
--model /home/models/Qwen/Qwen3-Omni-30B-A3B-Instruct \
--endpoint /v1/chat/completions \
--backend openai-chat-omni \
--num-prompts 2 \
--dataset-name sharegpt \
--dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
--percentile-metrics ttft,tpot,itl,e2el
```
If successful, you will see the following output:
```text
============ Serving Benchmark Result ============
Successful requests: 2
Failed requests: 0
Benchmark duration (s): 81.63
Request throughput (req/s): 0.02
Peak concurrent requests: 2.00
----------------End-to-end Latency----------------
Mean E2EL (ms): 56966.13
Median E2EL (ms): 56966.13
P99 E2EL (ms): 81016.80
================== Text Result ===================
Total input tokens: 36
Total generated tokens: 5926
Output token throughput (tok/s): 72.60
Peak output token throughput (tok/s): 103.00
Peak concurrent requests: 2.00
Total Token throughput (tok/s): 73.04
---------------Time to First Token----------------
Mean TTFT (ms): 124.76
Median TTFT (ms): 124.76
P99 TTFT (ms): 156.10
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 481.30
Median TPOT (ms): 481.30
P99 TPOT (ms): 947.55
---------------Inter-token Latency----------------
Mean ITL (ms): 25.11
Median ITL (ms): 0.33
P99 ITL (ms): 25.17
================== Audio Result ==================
Total audio duration generated(s): 3.95
Total audio frames generated: 94890
Audio throughput(audio duration/s): 0.05
==================================================
```
Or run the benchmark with the random dataset:
```bash
vllm bench serve \
--omni \
--port 43845 \
--endpoint /v1/chat/completions \
--backend openai-chat-omni \
--model /home/models/Qwen/Qwen3-Omni-30B-A3B-Instruct \
--dataset-name random \
--num-prompts 2 \
--random-prefix-len 5 \
--random-input-len 10 \
--random-output-len 100 \
--percentile-metrics ttft,tpot,itl,e2el,audio_ttfp,audio_rtf \
--ignore-eos
```
If successful, you will see the following output:
```text
============ Serving Benchmark Result ============
Successful requests: 2
Failed requests: 0
Benchmark duration (s): 24.35
Request throughput (req/s): 0.08
Peak concurrent requests: 2.00
----------------End-to-end Latency----------------
Mean E2EL (ms): 22576.23
Median E2EL (ms): 22576.23
P99 E2EL (ms): 24205.72
================== Text Result ===================
Total input tokens: 30
Total generated tokens: 8973
Output token throughput (tok/s): 368.52
Peak output token throughput (tok/s): 81.00
Peak concurrent requests: 2.00
Total Token throughput (tok/s): 369.76
---------------Time to First Token----------------
Mean TTFT (ms): 125.16
Median TTFT (ms): 125.16
P99 TTFT (ms): 155.88
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 5.01
Median TPOT (ms): 5.01
P99 TPOT (ms): 5.42
---------------Inter-token Latency----------------
Mean ITL (ms): 34.15
Median ITL (ms): 0.01
P99 ITL (ms): 376.19
================== Audio Result ==================
Total audio duration generated(s): 3.95
Total audio frames generated: 94890
Audio throughput(audio duration/s): 0.16
---------------Time to First Packet---------------
Mean AUDIO_TTFP (ms): 11756.89
Median AUDIO_TTFP (ms): 11756.89
P99 AUDIO_TTFP (ms): 20854.25
-----------------Real Time Factor-----------------
Mean AUDIO_RTF: 3.75
Median AUDIO_RTF: 3.75
P99 AUDIO_RTF: 7.39
==================================================
```
Note:
RTF is computed as (audio generation time - first packet latency) / audio duration.
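As a hedged illustration of that formula (the function and variable names below are made up for this sketch, not fields reported by the benchmark):

```python
def audio_rtf(audio_generation_time_s: float, first_packet_latency_s: float,
              audio_duration_s: float) -> float:
    """Real-time factor as described above: generation time after the first
    packet, divided by the duration of the generated audio. Values below 1.0
    mean audio is produced faster than real time."""
    return (audio_generation_time_s - first_packet_latency_s) / audio_duration_s


# Example: 20 s spent generating, 11.8 s until the first packet, 3.95 s of audio
# -> RTF of about 2.08 (slower than real time).
print(round(audio_rtf(20.0, 11.8, 3.95), 2))
```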
</details>
### Multi-Modal Benchmark
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Benchmark the performance of multi-modal requests in vLLM-Omni.
Generate synthetic image, video, and audio inputs alongside random text prompts to stress-test multimodal models without external datasets.
Notes:
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat-omni`) and endpoint `/v1/chat/completions`.
Start the server (example):
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni
```
It is recommended to use the `--ignore-eos` flag to simulate real responses. You can set the size of the output via the `--random-output-len` argument.
Then run the benchmarking script:
```bash
vllm bench serve \
--omni \
--dataset-name random-mm \
--port 40849 \
--model /home/models/Qwen/Qwen3-Omni-30B-A3B-Instruct \
--endpoint /v1/chat/completions \
--backend openai-chat-omni \
--request-rate 1 \
--num-prompts 1 \
--random-input-len 10 \
--random-range-ratio 0.0 \
--random-mm-base-items-per-request 2 \
--random-mm-num-mm-items-range-ratio 0 \
--random-mm-limit-mm-per-prompt '{"image":1,"video":1, "audio": 1}' \
--random-mm-bucket-config '{"(32, 32, 1)": 0.5, "(0, 1, 1)": 0.1, "(32, 32, 2)":0.4}' \
--ignore-eos \
--percentile-metrics ttft,tpot,itl \
--random-output-len 2 \
--extra_body '{"modalities": ["text"]}'
```
If successful, you will see the following output:
```text
============ Serving Benchmark Result ============
Successful requests: 1
Failed requests: 0
Request rate configured (RPS): 1.00
Benchmark duration (s): 1.21
Request throughput (req/s): 0.83
Peak concurrent requests: 1.00
================== Text Result ===================
Total input tokens: 10
Total generated tokens: 3
Output token throughput (tok/s): 2.49
Peak output token throughput (tok/s): 3.00
Peak concurrent requests: 1.00
Total Token throughput (tok/s): 10.77
---------------Time to First Token----------------
Mean TTFT (ms): 179.74
Median TTFT (ms): 179.74
P99 TTFT (ms): 179.74
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 12.76
Median TPOT (ms): 12.76
P99 TPOT (ms): 12.76
---------------Inter-token Latency----------------
Mean ITL (ms): 12.76
Median ITL (ms): 12.76
P99 ITL (ms): 25.24
================== Audio Result ==================
Total audio duration generated(s): 0.00
Total audio frames generated: 0
Audio throughput(audio duration/s): 0.00
==================================================
```
Behavioral notes:
- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
How sampling works:
- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
This should be seen as an edge case; this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that doing so might cause errors due to the engine config `--limit-mm-per-prompt`. A minimal sketch of this sampling procedure is shown after this list.
- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
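For illustration, here is a minimal, hypothetical sketch of the sampling procedure described above (not the actual benchmark code); the bucket keys, limits, and probabilities mirror the CLI flags shown earlier, and the bucket-to-modality mapping is an assumption for this sketch.
```python
import random

def modality_of(bucket):
    # Illustrative mapping only; the real tool's bucket-to-modality rule may differ.
    height, width, frames = bucket
    return "image" if frames == 1 else "video"

def sample_mm_items(k, buckets, limits):
    """Sample k (H, W, T) buckets using the probabilities from --random-mm-bucket-config,
    respecting the per-modality caps from --random-mm-limit-mm-per-prompt."""
    counts = {}
    items = []
    for _ in range(k):
        # Exclude buckets whose modality already hit its cap, then renormalize.
        active = {b: p for b, p in buckets.items()
                  if counts.get(modality_of(b), 0) < limits.get(modality_of(b), 0)}
        if not active:
            break
        total = sum(active.values())
        chosen = random.choices(list(active), weights=[p / total for p in active.values()])[0]
        counts[modality_of(chosen)] = counts.get(modality_of(chosen), 0) + 1
        items.append(chosen)
    return items

# Example: two items per request, at most one image and one video.
print(sample_mm_items(2, {(32, 32, 1): 0.5, (32, 32, 2): 0.5}, {"image": 1, "video": 1}))
```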
</details>
# Contact Us
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm-omni/issues)
- For coordinating contributions and development, and for discussions with other users and developers, please join the `sig-omni` channel in our [Slack](https://slack.vllm.ai/) or use the [vLLM Forum](https://discuss.vllm.ai/)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm-omni/security/advisories) feature
# Volunteers for Bugfix and CI
We encourage you to check the current docs and [issues](https://github.com/vllm-project/vllm-omni/issues) for possible solutions to your questions. If none of these resolve it, please open an issue describing the bug or CI problem you hit during development.
If you urgently need help locating or fixing a bug or a CI problem, please contact the community volunteers listed below.
| Dec 4-Dec 12 | Dec 15-Dec 19 | Dec 22-Dec 26 | Dec 29- Jan 2, 2026| Jan 5-Jan 9 | Jan 12-Jan 16 |
|----------|----------|----------|----------|----------|----------|
| <a href="https://github.com/congw729">congw729</a> | <a href="https://github.com/yinpeiqi">yinpeiqi</a> | <a href="https://github.com/tzhouam">tzhouam</a> | <a href="https://github.com/SamitHuang">SamitHuang</a> | <a href="https://github.com/gcanlin">gcanlin</a> | <a href="https://github.com/natureofnature">natureofnature</a> |
| <a href="https://github.com/david6666666">david6666666</a> | <a href="https://github.com/R2-Y">R2-Y</a> | <a href="https://github.com/hsliuustc0106">hsliuustc0106</a> | <a href="https://github.com/Gaohan123">Gaohan123</a> | <a href="https://github.com/ZJY0516">ZJY0516</a> | <a href="https://github.com/qibaoyuan">qibaoyuan</a> |
We warmly welcome more contributors to fix bugs and add new features!
# Configuration Options
This section lists the most common options for running vLLM-Omni.
For options within a vLLM engine, please refer to [vLLM Configuration](https://docs.vllm.ai/en/v0.14.0/configuration/index.html).
Currently, the main options are maintained in per-model stage configs.
For a specific example, please refer to the [Qwen2.5-Omni stage config](stage_configs/qwen2_5_omni.yaml).
For an introduction, please check the [Introduction for stage config](./stage_configs.md).
## Memory Configuration
- **[GPU Memory Calculation and Configuration](./gpu_memory_utilization.md)** - Guide on how to calculate memory requirements and set up `gpu_memory_utilization` for optimal performance
## Optimization Features
- **[TeaCache Configuration](../user_guide/diffusion/teacache.md)** - Enable TeaCache adaptive caching for DiT models to achieve 1.5x-2.0x speedup with minimal quality loss
- **[Cache-DiT Configuration](../user_guide/diffusion/cache_dit_acceleration.md)** - Enable Cache-DiT as cache acceleration backends for DiT models
- **[Parallelism Configuration](../user_guide/diffusion/parallelism_acceleration.md)** - Enable parallelism (e.g., sequence parallelism) for DiT models
# GPU Memory Calculation and Configuration
This guide explains how to calculate GPU memory requirements and properly configure `gpu_memory_utilization` for vLLM-Omni stages.
## Overview
`gpu_memory_utilization` is a critical parameter that controls how much GPU memory each stage can use. It's specified as a fraction between 0.0 and 1.0, where:
- `0.8` means 80% of the GPU's total memory
- `1.0` means 100% of the GPU's total memory (not recommended, leaves no buffer)
## How Memory is Calculated
### Memory Allocation Formula
For each stage, vLLM-Omni calculates the requested memory as:
```
requested_memory = total_gpu_memory × gpu_memory_utilization
```
The system checks that:
```
free_memory ≥ requested_memory
```
If this condition is not met, the stage will fail to initialize with an error message showing the memory requirements.
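A rough sketch of this check (illustrative only, not the actual vLLM-Omni code) could look like the following, using PyTorch's free/total memory query:
```python
import torch

def check_stage_memory(gpu_memory_utilization: float, device: int = 0) -> float:
    """Illustrative check: fail early if free GPU memory can't cover the requested fraction."""
    free_memory, total_memory = torch.cuda.mem_get_info(device)
    requested_memory = total_memory * gpu_memory_utilization
    if free_memory < requested_memory:
        raise RuntimeError(
            f"Free memory {free_memory / 1e9:.1f} GB is less than the requested "
            f"{requested_memory / 1e9:.1f} GB ({gpu_memory_utilization:.0%} of total)."
        )
    return requested_memory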
### Memory Components
The total memory used by a stage includes:
1. **Model Weights**: The size of the model parameters loaded on the GPU
2. **KV Cache**: Memory for storing key-value cache during generation
3. **Activation Memory**: Temporary memory for intermediate computations
4. **System Overhead**: Memory used by CUDA, PyTorch, and other system components
5. **Non-Torch Memory**: Memory allocated outside of PyTorch (e.g., CUDA graphs)
### Example Calculation
For a GPU with 80GB total memory:
- `gpu_memory_utilization: 0.8` → 64GB available for the stage
- `gpu_memory_utilization: 0.6` → 48GB available for the stage
- `gpu_memory_utilization: 0.15` → 12GB available for the stage
## Setting Up `gpu_memory_utilization`
### Step 1: Determine GPU Memory
First, check your GPU's total memory:
```bash
# Using nvidia-smi
nvidia-smi --query-gpu=memory.total --format=csv
# Or using Python
python -c "import torch; print(f'{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')"
```
### Step 2: Estimate Model Memory Requirements
#### For Autoregressive (AR) Stages
AR stages typically need more memory due to:
- Large model weights
- KV cache for attention
- Activation buffers
#### For Diffusion/Generation Stages
Diffusion stages (like code2wav) typically need less memory:
- Smaller model components
- Different memory access patterns
**Typical values:**
- `0.1 - 0.3` for most diffusion stages
### Step 3: Consider Multi-Stage Scenarios
When multiple stages share the same GPU, you must ensure the sum of their `gpu_memory_utilization` values doesn't exceed 1.0.
**Example: Two stages on GPU 0**
```yaml
stage_args:
- stage_id: 0
runtime:
devices: "0"
engine_args:
gpu_memory_utilization: 0.6 # Uses 60% of GPU 0
- stage_id: 1
runtime:
devices: "0"
engine_args:
gpu_memory_utilization: 0.3 # Uses 30% of GPU 0
# Total: 90% of GPU 0 (safe, leaves 10% buffer)
```
**Important:** If stages run on different GPUs, each can use up to 1.0 independently.
### Step 4: Account for Tensor Parallelism
When using `tensor_parallel_size > 1`, the model is split across multiple GPUs, so each GPU needs less memory.
**Example: 2-way tensor parallelism**
```yaml
stage_args:
- stage_id: 0
runtime:
devices: "0,1" # Uses both GPUs
engine_args:
tensor_parallel_size: 2
gpu_memory_utilization: 0.6 # 60% per GPU
# Model is split, so each GPU uses ~30% of model memory
```
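As a rough back-of-the-envelope check (illustrative only; the numbers below are not tied to any specific model), the per-GPU weight footprint under tensor parallelism can be estimated as:
```python
def per_gpu_weight_gb(num_params_billion: float, dtype_bytes: int, tp_size: int) -> float:
    # Model weights are split roughly evenly across tensor-parallel ranks.
    return num_params_billion * dtype_bytes / tp_size

# e.g. a 30B-parameter model in BF16 (2 bytes) with tensor_parallel_size=2:
print(f"{per_gpu_weight_gb(30, 2, 2):.0f} GB per GPU")  # ~30 GB per GPU
```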
## Examples
### Qwen3-Omni-MoE on 2x H100-80GB
```yaml
stage_args:
- stage_id: 0 # Thinker stage with TP=2
runtime:
devices: "0,1"
engine_args:
tensor_parallel_size: 2
gpu_memory_utilization: 0.6 # 48GB per GPU
- stage_id: 1 # Talker stage
runtime:
devices: "1"
engine_args:
gpu_memory_utilization: 0.3 # 24GB on GPU 1
- stage_id: 2 # Code2Wav stage
runtime:
devices: "0"
engine_args:
gpu_memory_utilization: 0.1 # 8GB on GPU 0
```
**Note:** In this configuration, stages 0 and 2 share GPU 0, but they run at different times in the pipeline, so their memory usage doesn't overlap.
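To sanity-check configurations like the one above, a small helper (hypothetical; the function name and YAML path are illustrative, and the field names follow the stage config format used in this guide) can sum `gpu_memory_utilization` per GPU:
```python
from collections import defaultdict

import yaml  # PyYAML

def per_gpu_utilization(stage_config_path: str) -> dict[str, float]:
    """Sum gpu_memory_utilization per visible device; co-located stages should stay below 1.0."""
    with open(stage_config_path) as f:
        cfg = yaml.safe_load(f)
    totals: dict[str, float] = defaultdict(float)
    for stage in cfg["stage_args"]:
        util = stage["engine_args"].get("gpu_memory_utilization", 0.8)
        for dev in str(stage["runtime"]["devices"]).split(","):
            totals[dev.strip()] += util
    return dict(totals)

# For the Qwen3-Omni example above, this would report roughly {'0': 0.7, '1': 0.9}.
print(per_gpu_utilization("stage_configs/qwen3_omni_moe.yaml"))
```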
## Troubleshooting
### Error: "Free memory is less than desired GPU memory utilization"
This means the GPU doesn't have enough free memory when the stage starts.
**Solutions:**
1. Free up memory by closing other processes
2. Reduce `gpu_memory_utilization` for this stage
3. Use a GPU with more memory
4. Move the stage to a different GPU
### Error: OOM during inference
The stage initialized but ran out of memory during processing.
**Solutions:**
1. Reduce `max_num_batched_tokens`
2. Reduce `max_batch_size` in runtime config
3. Lower `gpu_memory_utilization` slightly
4. Enable quantization if supported
### Memory Not Fully Utilized
If you see low memory usage, you can:
1. Increase `gpu_memory_utilization` to allow larger KV cache
2. Increase `max_num_batched_tokens` for better batching
3. Check if other stages are limiting throughput
## Useful Formulas for Memory Calculation
### KV Cache Memory
The KV cache size depends on:
- Number of sequences in batch
- Sequence length (prompt + generation)
- Model hidden size
- Number of attention heads
- Number of layers
Approximate formula:
```
kv_cache_memory ≈ batch_size × seq_len × hidden_size × num_layers × 2 × dtype_size
```
The factor of 2 accounts for both keys (K) and values (V).
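For example, plugging illustrative numbers into the formula above (a hypothetical helper, not part of vLLM-Omni):
```python
def kv_cache_gb(batch_size: int, seq_len: int, hidden_size: int,
                num_layers: int, dtype_bytes: int = 2) -> float:
    # The factor of 2 covers keys and values; dtype_bytes=2 assumes FP16/BF16.
    return batch_size * seq_len * hidden_size * num_layers * 2 * dtype_bytes / 1e9

# Illustrative only: batch of 4, 8K-token sequences, 4096 hidden size, 32 layers.
print(f"{kv_cache_gb(4, 8192, 4096, 32):.1f} GB")  # ~17.2 GB
```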
### Model Weight Memory
```
model_memory ≈ num_parameters × dtype_size
```
For example:
- 7B parameters in FP16: ~14GB
- 7B parameters in FP32: ~28GB
- 7B parameters in INT8: ~7GB
### Activation Memory
Activation memory is typically smaller but varies with:
- Batch size
- Sequence length
- Model architecture
It's usually 10-30% of model weight memory during inference.
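Putting the pieces together, a very rough per-stage estimate can be formed from the components above. In the sketch below, the 20% activation fraction and 2 GB overhead are assumptions for illustration, not measured values:
```python
def estimate_stage_memory_gb(weight_gb: float, kv_cache_gb: float,
                             activation_frac: float = 0.2, overhead_gb: float = 2.0) -> float:
    # weights + KV cache + activations (assumed ~10-30% of weights) + system/non-torch overhead
    return weight_gb + kv_cache_gb + weight_gb * activation_frac + overhead_gb

# e.g. a 14 GB model with a 17 GB KV cache -> roughly 36 GB
print(f"{estimate_stage_memory_gb(14, 17):.0f} GB")
```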
# Stage configs for vLLM-Omni
In vLLM-Omni, the target model is separated into multiple stages, each processed by a different engine (an LLMEngine, a DiffusionEngine, or another engine type). Depending on the stage type, such as an Autoregressive (AR) stage or a Diffusion Transformer (DiT) stage, each stage can choose the corresponding scheduler and model worker to load with its engine in a plug-in fashion.
!!! note
Default stage config YAMLs (for example, `vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml` and `vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml`) are bundled and loaded automatically when `stage_configs_path` is not provided. They have been verified to work on 1xH100 for Qwen2.5-Omni and 2xH100 for Qwen3-Omni.
Therefore, as a core part of vLLM-Omni, the stage configs for a model have several main functions:
- Declare the partition into stages and the corresponding class implementations in `model_executor/models`.
- The disaggregated configuration for each stage and the communication topology among them.
- Engine arguments for each engine within the stage.
- Input and output dependencies for each stage.
- Default input parameters.
If users want to modify any part of it, a custom stage_configs file can be passed as an argument in both online and offline modes, as in the examples below:
For offline use (assuming the necessary dependencies have been imported):
```python
model_name = "Qwen/Qwen2.5-Omni-7B"
omni_llm = OmniLLM(model=model_name, stage_configs_path="/path/to/custom_stage_configs.yaml")
```
For online serving:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091 --stage-configs-path /path/to/stage_configs_file
```
!!! important
    We are actively iterating on the definition of stage configs, and we welcome all feedback from both community users and developers to help us shape its development!
Below is a specific example of a stage_configs.yaml for Qwen2.5-Omni.
```yaml
# stage config for running qwen2.5-omni with architecture of OmniLLM.
stage_args:
- stage_id: 0 # mark the unique id for each stage
runtime: # The disaggregated configuration
process: true # Run this stage in a separate process
devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
max_batch_size: 1 # the batch_size for offline inference
engine_args: # Engine arguments for a certain engine
model_stage: thinker
model_arch: Qwen2_5OmniForConditionalGeneration # The model implementation registered in model_executor/models/registry.py
worker_type: ar # The specific worker used
      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler # The specific scheduler used
      gpu_memory_utilization: 0.8 # The GPU memory fraction allocated to this stage on a single chip
      enforce_eager: true # Currently only eager mode is supported
      trust_remote_code: true # Needed for Hugging Face config parsing
      engine_output_type: latent # The stage outputs latent hidden states in addition to token IDs
      enable_prefix_caching: false # Prefix caching is not yet supported for requests with hidden-state outputs
      is_comprehension: true # Whether this stage is a text/multimodal comprehension module; if so, AsyncOmni uses its tokenizer by default
      final_output: true # Whether this stage's output is part of the final outputs; if false, the stage is only an intermediate stage
      final_output_type: text # The final output type; currently text or audio
default_sampling_params: # sampling parameters for the stage. Their meaning aligns with vLLM.
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.1
- stage_id: 1
runtime:
process: true
devices: "1"
max_batch_size: 3
engine_args:
model_stage: talker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.8
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: latent
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params:
temperature: 0.9
top_p: 0.8
top_k: 40
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.05
stop_token_ids: [8294]
- stage_id: 2
runtime:
process: true
devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization: 0.15
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio
engine_input_source: [1]
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.1
# Top-level runtime config (concise): default windows and stage edges
runtime:
enabled: true
defaults:
window_size: -1 # Simplified: trigger downstream only after full upstream completion
max_inflight: 1 # Simplified: process serially within each stage
edges:
- from: 0 # thinker → talker: trigger only after receiving full input (-1)
to: 1
window_size: -1
- from: 1 # talker → code2wav: trigger only after receiving full input (-1)
to: 2
window_size: -1
```
## Stage Configuration Arguments
Each stage in the `stage_args` list contains the following configuration options:
### `stage_id`
A unique identifier for each stage in the multi-stage pipeline. Stages are numbered sequentially starting from 0, and this ID is used to reference stages in inter-stage dependencies (e.g., `engine_input_source`).
### `runtime`
Configuration for disaggregated execution of the stage, controlling how the stage is deployed and executed.
#### `runtime.process`
Whether to run this stage in a separate process. When set to `true`, the stage will be executed in an isolated process, enabling better resource isolation and parallel execution across different stages. This is essential for multi-GPU deployments where different stages run on different devices.
Default: `true`
#### `runtime.devices`
Visible devices for this stage, specified as a string. This controls which GPU devices are available to the stage process, similar to setting `CUDA_VISIBLE_DEVICES` or using `torch.cuda.set_device()`. For example, `"0"` uses GPU 0, `"1"` uses GPU 1, and `"0,1"` makes both GPUs 0 and 1 visible.
Default: `"0"`
#### `runtime.max_batch_size`
The maximum batch size for offline inference in this stage. This limits how many sequences can be processed together in a single batch during offline inference operations.
Default: `1`
### `engine_args`
Engine arguments for configuring the LLM engine, diffusion engine, or other engine types used by this stage.
#### `engine_args.model_stage`
The name identifier for this model stage within the multi-stage architecture. This is used internally to distinguish different stages of the same model (e.g., "thinker", "talker", "code2wav" in Qwen2.5-Omni).
#### `engine_args.model_arch`
The model architecture class name that is registered in `model_executor/models/registry.py`. This specifies which model implementation to use for this stage. The class must be registered in the model registry for vLLM-Omni to locate and instantiate it.
#### `engine_args.worker_cls`
The specific worker class to use for this stage. This determines how the model computations are executed. Examples include `vllm_omni.worker.gpu_ar_worker.GPUARWorker` for autoregressive stages and `vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker` for diffusion-based stages.
#### `engine_args.scheduler_cls`
The scheduler class to use for this stage. The scheduler manages request queuing, batching, and execution order. Examples include `vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler` for standard stages and `vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler` for diffusion stages.
#### `engine_args.gpu_memory_utilization`
The fraction of GPU memory to allocate for this stage within a single GPU chip. This is a value between 0.0 and 1.0, where 0.8 means 80% of the GPU memory will be used by this stage. This allows fine-grained control over memory allocation when multiple stages share the same GPU or when reserving memory for other operations.
Default: `0.8`
!!! tip "Memory Configuration Guide"
For detailed information on how to calculate memory requirements and properly configure `gpu_memory_utilization`, see the [GPU Memory Calculation and Configuration Guide](./gpu_memory_utilization.md).
#### `engine_args.enforce_eager`
Whether to enforce eager execution mode. When set to `true`, the engine will run in eager mode without using CUDA graphs or other compilation optimizations. Currently, vLLM-Omni only supports eager mode.
Default: `true`
#### `engine_args.trust_remote_code`
Whether to trust remote code when loading models from Hugging Face. This is required for models that use custom code in their configuration files. Set to `true` when loading models that require custom model implementations.
Default: `true`
#### `engine_args.engine_output_type`
Specifies the type of output produced by this stage's engine. This determines what kind of data flows to downstream stages. Possible values include `latent` (hidden states), `text` (tokenized text), and `audio` (audio waveforms). When set to `latent`, the stage outputs latent hidden states in addition to token IDs, which are consumed by downstream stages.
Default: `latent`
#### `engine_args.enable_prefix_caching`
Whether to enable prefix caching for this stage. Prefix caching can improve performance by caching KV cache for common prompt prefixes. However, for requests that output hidden states (when `engine_output_type` is `latent`), prefix caching is not currently supported and should be set to `false`.
Default: `false`
### `is_comprehension`
Whether this stage is a text or multimodal comprehension module. When set to `true`, the stage acts as a comprehension module that processes input text or multimodal content. If this is the first comprehension stage, `AsyncOmni` will use its tokenizer as the default tokenizer for the entire pipeline.
Default: `true`
### `final_output`
Whether this stage produces output that is part of the final outputs returned to the user. When set to `false`, the stage only works as an intermediate stage, processing data that flows to downstream stages but not contributing directly to the final response.
Default: `true`
### `final_output_type`
The type of final output produced by this stage. This specifies what format the output will be in when returned to the user. Currently supported values are `text` (for text generation) and `audio` (for audio generation).
Default: `text`
### `default_sampling_params`
Default sampling parameters for this stage. These parameters control the generation behavior and align with vLLM's sampling parameter semantics. These defaults are used when no explicit sampling parameters are provided in the request.
#### `default_sampling_params.temperature`
Sampling temperature for controlling randomness. Lower values (e.g., 0.0) make the output more deterministic and focused, while higher values increase randomness.
Default: `0.0`
#### `default_sampling_params.top_p`
Nucleus sampling parameter. Only tokens with cumulative probability mass up to `top_p` are considered. This helps filter out low-probability tokens while maintaining diversity.
Default: `1.0`
#### `default_sampling_params.top_k`
Top-k sampling parameter. Only the top `k` most likely tokens are considered. Set to `-1` to disable top-k filtering and consider all tokens.
Default: `-1`
#### `default_sampling_params.max_tokens`
Maximum number of tokens to generate in this stage. This limits the length of the output sequence.
Default: `2048`
#### `default_sampling_params.seed`
Random seed for reproducible generation. When set, the random number generator will be initialized with this seed to ensure consistent outputs across runs.
Default: `42`
#### `default_sampling_params.detokenize`
Whether to detokenize the output tokens into text. When set to `true`, token IDs are converted back to readable text strings.
Default: `True`
#### `default_sampling_params.repetition_penalty`
Penalty applied to tokens that have already appeared in the generated sequence. Values greater than 1.0 discourage repetition, while values less than 1.0 encourage it. A value of 1.0 applies no penalty.
Default: `1.1`
# stage config for running qwen2.5-omni with architecture of OmniLLM.
stage_args:
- stage_id: 0
runtime:
process: true # Run this stage in a separate process
devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.8
enforce_eager: true # Now we only support eager mode
trust_remote_code: true
engine_output_type: latent
enable_prefix_caching: false
is_comprehension: true
final_output: true
final_output_type: text
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.1
- stage_id: 1
runtime:
process: true
devices: "1"
max_batch_size: 1
engine_args:
model_stage: talker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.8
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: latent
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params:
temperature: 0.9
top_p: 0.8
top_k: 40
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.05
stop_token_ids: [8294]
- stage_id: 2
runtime:
process: true
devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization: 0.15
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio
engine_input_source: [1]
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.1
# Top-level runtime config (concise): default windows and stage edges
runtime:
enabled: true
defaults:
window_size: -1 # Simplified: trigger downstream only after full upstream completion
max_inflight: 1 # Simplified: process serially within each stage
edges:
- from: 0 # thinker → talker: trigger only after receiving full input (-1)
to: 1
window_size: -1
- from: 1 # talker → code2wav: trigger only after receiving full input (-1)
to: 2
window_size: -1