Commit c1cacde6 authored by weishb

vllm-omni_0.15.0.rc1+fix1 first commit

parent 35607782
import argparse
import json
import os
import soundfile as sf
from qwen3_omni_moe_model import Qwen3OmniMoeForConditionalGenerationWithLogging
from qwen_omni_utils import process_mm_info
from tqdm import tqdm
from transformers import Qwen3OmniMoeProcessor
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking"
def load_prompts(prompts_file: str) -> list[str]:
    """Load prompts from a text file, one prompt per line."""
    prompts = []
    with open(prompts_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                prompts.append(line)
    return prompts
def run_benchmark(
    model,
    processor,
    prompts: list[str],
    output_dir: str = "benchmark_results",
    speaker: str = "Ethan",
    use_audio_in_video: bool = True,
):
    """
    Run benchmark on a list of prompts and collect performance stats.

    Args:
        model: The Qwen3OmniMoe model
        processor: The Qwen3OmniMoe processor
        prompts: List of text prompts to process
        output_dir: Directory to save results
        speaker: Speaker voice for audio output
        use_audio_in_video: Whether to use audio in video

    Returns:
        tuple: (aggregated_stats, results, audio_outputs)
            - aggregated_stats: dict with aggregated performance statistics
            - results: list of dicts with per-prompt results
            - audio_outputs: list of audio tensors/arrays (or None if no audio)
    """
    os.makedirs(output_dir, exist_ok=True)
    audio_dir = os.path.join(output_dir, "audio")
    os.makedirs(audio_dir, exist_ok=True)

    all_stats = []
    results = []
    audio_outputs = []

    for idx, prompt in enumerate(tqdm(prompts, desc="Processing prompts")):
        conversation = [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}],
            },
        ]

        # Preparation for inference
        text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
        audios, images, videos = process_mm_info(conversation, use_audio_in_video=use_audio_in_video)
        inputs = processor(
            text=text,
            audio=audios,
            images=images,
            videos=videos,
            return_tensors="pt",
            padding=True,
            use_audio_in_video=use_audio_in_video,
        )
        inputs = inputs.to(model.device).to(model.dtype)

        # Inference: generation of the output text and audio
        text_ids, audio = model.generate(
            **inputs, speaker=speaker, thinker_return_dict_in_generate=True, use_audio_in_video=use_audio_in_video
        )

        # Decode output text
        output_text = processor.batch_decode(
            text_ids.sequences[:, inputs["input_ids"].shape[1] :],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        # Collect performance stats
        perf_stats = None
        if hasattr(model, "_perf_stats_last"):
            perf_stats = model._perf_stats_last.copy()
            perf_stats["prompt_idx"] = idx
            perf_stats["prompt"] = prompt
            all_stats.append(perf_stats)

        # Save audio and collect audio output
        audio_path = None
        audio_data = None
        if audio is not None:
            audio_data = audio.reshape(-1).detach().cpu().numpy()
            audio_path = os.path.join(audio_dir, f"output_{idx:04d}.wav")
            sf.write(
                audio_path,
                audio_data,
                samplerate=24000,
            )
            audio_outputs.append(audio_data)
        else:
            audio_outputs.append(None)

        # Save result
        result = {
            "idx": idx,
            "prompt": prompt,
            "output": output_text,
            "audio_path": audio_path,
            "perf_stats": perf_stats,
        }
        results.append(result)

    # Aggregate statistics
    aggregated_stats = aggregate_stats(all_stats)

    # Save all results
    results_path = os.path.join(output_dir, "results.json")
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    # Save aggregated stats
    stats_path = os.path.join(output_dir, "perf_stats.json")
    with open(stats_path, "w", encoding="utf-8") as f:
        json.dump({"aggregated": aggregated_stats, "per_prompt": all_stats}, f, ensure_ascii=False, indent=2)

    # Count saved audio files
    num_audio_saved = sum(1 for a in audio_outputs if a is not None)
    print(f"\nSaved {num_audio_saved} audio files to {audio_dir}/")

    return aggregated_stats, results, audio_outputs
def aggregate_stats(all_stats: list[dict]) -> dict:
    """Aggregate performance statistics from multiple runs."""
    if not all_stats:
        return {}

    keys = [
        "thinker_tokens",
        "thinker_time_s",
        "thinker_tps",
        "talker_tokens",
        "talker_time_s",
        "talker_tps",
        "code2wav_tokens",
        "code2wav_time_s",
        "code2wav_tps",
        "total_tokens",
        "total_time_s",
        "total_tps",
    ]

    aggregated = {
        "num_samples": len(all_stats),
    }
    for key in keys:
        values = [s.get(key, 0) for s in all_stats if key in s]
        if values:
            aggregated[f"{key}_sum"] = sum(values)
            aggregated[f"{key}_avg"] = sum(values) / len(values)
            aggregated[f"{key}_min"] = min(values)
            aggregated[f"{key}_max"] = max(values)

    # Calculate overall throughput
    total_tokens = aggregated.get("total_tokens_sum", 0)
    total_time = aggregated.get("total_time_s_sum", 0)
    if total_time > 0:
        aggregated["overall_tps"] = total_tokens / total_time

    return aggregated
def print_stats(stats: dict):
    """Print performance statistics in a formatted way."""
    print("\n" + "=" * 60)
    print("Performance Statistics Summary")
    print("=" * 60)
    print(f"\nNumber of samples: {stats.get('num_samples', 0)}")

    print("\n--- Thinker ---")
    print(f"  Total tokens: {stats.get('thinker_tokens_sum', 0):.0f}")
    print(f"  Total time:   {stats.get('thinker_time_s_sum', 0):.2f}s")
    print(f"  Avg TPS:      {stats.get('thinker_tps_avg', 0):.2f}")
    print(f"  Min TPS:      {stats.get('thinker_tps_min', 0):.2f}")
    print(f"  Max TPS:      {stats.get('thinker_tps_max', 0):.2f}")

    print("\n--- Talker ---")
    print(f"  Total tokens: {stats.get('talker_tokens_sum', 0):.0f}")
    print(f"  Total time:   {stats.get('talker_time_s_sum', 0):.2f}s")
    print(f"  Avg TPS:      {stats.get('talker_tps_avg', 0):.2f}")
    print(f"  Min TPS:      {stats.get('talker_tps_min', 0):.2f}")
    print(f"  Max TPS:      {stats.get('talker_tps_max', 0):.2f}")

    print("\n--- Code2Wav ---")
    print(f"  Total tokens: {stats.get('code2wav_tokens_sum', 0):.0f}")
    print(f"  Total time:   {stats.get('code2wav_time_s_sum', 0):.2f}s")
    print(f"  Avg TPS:      {stats.get('code2wav_tps_avg', 0):.2f}")
    print(f"  Min TPS:      {stats.get('code2wav_tps_min', 0):.2f}")
    print(f"  Max TPS:      {stats.get('code2wav_tps_max', 0):.2f}")

    print("\n--- Overall ---")
    print(f"  Total tokens: {stats.get('total_tokens_sum', 0):.0f}")
    print(f"  Total time:   {stats.get('total_time_s_sum', 0):.2f}s")
    print(f"  Overall TPS:  {stats.get('overall_tps', 0):.2f}")
    print(f"  Avg TPS:      {stats.get('total_tps_avg', 0):.2f}")
    print(f"  Min TPS:      {stats.get('total_tps_min', 0):.2f}")
    print(f"  Max TPS:      {stats.get('total_tps_max', 0):.2f}")
    print("=" * 60 + "\n")
def main():
    parser = argparse.ArgumentParser(description="Qwen3-Omni Benchmark Script")
    parser.add_argument(
        "--prompts_file",
        type=str,
        default="benchmark/build_dataset/top100.txt",
        help="Path to the prompts file (one prompt per line)",
    )
    parser.add_argument(
        "--output_dir", type=str, default="benchmark_results", help="Directory to save benchmark results"
    )
    parser.add_argument("--model_path", type=str, default=MODEL_PATH, help="Path to the model")
    parser.add_argument("--speaker", type=str, default="Ethan", help="Speaker voice for audio output")
    parser.add_argument("--num_prompts", type=int, default=None, help="Number of prompts to process (default: all)")
    args = parser.parse_args()

    # Load model and processor
    print(f"Loading model from {args.model_path}...")
    model = Qwen3OmniMoeForConditionalGenerationWithLogging.from_pretrained(
        args.model_path,
        dtype="auto",
        device_map="auto",
        attn_implementation="flash_attention_2",
    )
    processor = Qwen3OmniMoeProcessor.from_pretrained(args.model_path)

    # Benchmark mode
    print(f"Loading prompts from {args.prompts_file}...")
    prompts = load_prompts(args.prompts_file)
    if args.num_prompts:
        prompts = prompts[: args.num_prompts]

    print(f"Running benchmark on {len(prompts)} prompts...")
    aggregated_stats, results, audio_outputs = run_benchmark(
        model=model,
        processor=processor,
        prompts=prompts,
        output_dir=args.output_dir,
        speaker=args.speaker,
    )
    print_stats(aggregated_stats)
    print(f"\nResults saved to {args.output_dir}/")


if __name__ == "__main__":
    main()
#!/bin/bash
# Qwen3-Omni Benchmark Evaluation Script
# This script must be run from the vllm-omni root directory
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Navigate to the vllm-omni root directory (three levels up from the script location)
VLLM_OMNI_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
cd "$VLLM_OMNI_ROOT" || { echo "Error: Failed to navigate to vllm-omni directory"; exit 1; }
echo "Working directory: $(pwd)"
# Verify we're in the correct directory and run benchmark
if [[ ! -d "benchmarks/qwen3-omni/vllm_omni" ]]; then
    echo "Error: Not in vllm-omni root directory. Please run from the vllm-omni folder."
else
    log_dir=benchmarks/qwen3-omni/vllm_omni/logs
    outputs_dir=benchmarks/qwen3-omni/vllm_omni/outputs
    end2end_script_path=examples/offline_inference/qwen3_omni/end2end.py
    build_dataset_path=benchmarks/build_dataset/top100.txt
    python $end2end_script_path --output-wav $outputs_dir \
        --query-type text \
        --txt-prompts $build_dataset_path \
        --enable-stats \
        --log-dir $log_dir
    echo "Logs and outputs are saved in ${log_dir} and ${outputs_dir} respectively:"
    echo "  - omni_llm_pipeline_text                            run dir/base name"
    echo "  - omni_llm_pipeline_text.orchestrator.stats.jsonl   orchestrator-stage latency stats"
    echo "  - omni_llm_pipeline_text.overall.stats.jsonl        overall latency/TPS stats"
    echo "  - omni_llm_pipeline_text.stage0.log                 per-stage detailed logs"
    echo "  - omni_llm_pipeline_text.stage1.log"
    echo "  - omni_llm_pipeline_text.stage2.log"
    echo "Key checks: overall.stats.jsonl for end-to-end latency/TPS; orchestrator.stats.jsonl for stable per-stage latency; stage*.log for errors or long tails."
    echo "  - outputs/                                          generated txt and wav files (there should be 100 text and 100 wav files)"
fi
ARG VLLM_BASE_IMAGE=vllm/vllm-openai
ARG VLLM_BASE_TAG=v0.15.0
FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG}
ARG APP_DIR=/workspace/vllm-omni
WORKDIR ${APP_DIR}
COPY . .
# Install system dependencies
RUN apt-get update && \
apt-get install -y ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install vllm-omni into the same uv-managed Python environment used by the base image.
RUN uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]"
RUN ln -sf /usr/bin/python3 /usr/bin/python
ENTRYPOINT []
ARG VLLM_ASCEND_IMAGE=quay.nju.edu.cn/ascend/vllm-ascend
ARG VLLM_ASCEND_TAG=v0.11.0rc2
FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}
ARG APP_DIR=/vllm-workspace/vllm-omni
WORKDIR ${APP_DIR}
COPY . .
# Install vllm-omni with dev dependencies
RUN pip install --no-cache-dir -e ".[dev]"
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENTRYPOINT []
ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.15.0
FROM ${BASE_IMAGE} AS final
ARG COMMON_WORKDIR=/app
WORKDIR ${COMMON_WORKDIR}
# Step 1: Setup - Install system dependencies
RUN apt-get update && \
apt-get install -y ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni
# Step 2: Copy vllm-omni code and install it into the base image's Python environment
COPY . ${COMMON_WORKDIR}/vllm-omni
RUN cd ${COMMON_WORKDIR}/vllm-omni && uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]"
# Before installing onnxruntime-rocm, the system-installed onnxruntime must be uninstalled first.
# These packages are dependencies of Qwen3-TTS.
RUN uv pip uninstall onnxruntime --system && uv pip install --no-cache-dir onnxruntime-rocm sox --system
RUN ln -sf /usr/bin/python3 /usr/bin/python
CMD ["/bin/bash"]
ENTRYPOINT []
# Set entrypoint for vllm-openai official images
FROM final AS vllm-openai
ENTRYPOINT ["vllm", "serve", "--omni"]
nav:
  - Home: README.md
  - User Guide:
    - Getting Started:
      - getting_started/quickstart.md
      - getting_started/installation/*
    - Serving:
      - OpenAI-Compatible API:
        - Image Generation: serving/image_generation_api.md
        - Image Edit: serving/image_edit_api.md
    - Examples:
      - examples/README.md
      - Offline Inference:
        - Image-To-Image: user_guide/examples/offline_inference/image_to_image.md
        - Image-To-Video: user_guide/examples/offline_inference/image_to_video.md
        - Qwen2.5-Omni: user_guide/examples/offline_inference/qwen2_5_omni.md
        - Qwen3-Omni: user_guide/examples/offline_inference/qwen3_omni.md
        - Qwen3-TTS Offline Inference: user_guide/examples/offline_inference/qwen3_tts.md
        - Text-To-Image: user_guide/examples/offline_inference/text_to_image.md
        - Text-To-Video: user_guide/examples/offline_inference/text_to_video.md
      - Online Serving:
        - Image-To-Image: user_guide/examples/online_serving/image_to_image.md
        - Qwen2.5-Omni: user_guide/examples/online_serving/qwen2_5_omni.md
        - Qwen3-Omni: user_guide/examples/online_serving/qwen3_omni.md
        - Text-To-Image: user_guide/examples/online_serving/text_to_image.md
    - General:
      - usage/*
    - Configuration:
      - configuration/README.md
      - configuration/*
    - Models:
      - models/supported_models.md
    - Features:
      - Sleep Mode: features/sleep_mode.md
    - Diffusion Features:
      - Overview: user_guide/diffusion_acceleration.md
      - TeaCache: user_guide/diffusion/teacache.md
      - Cache-DiT: user_guide/diffusion/cache_dit_acceleration.md
      - Parallelism Acceleration: user_guide/diffusion/parallelism_acceleration.md
      - CPU Offloading: user_guide/diffusion/cpu_offload_diffusion.md
  - Developer Guide:
    - General:
      - contributing/README.md
      - glob: contributing/*
        flatten_single_child_sections: true
    - Model Implementation:
      - contributing/model/README.md
      - contributing/model/adding_omni_model.md
      - contributing/model/adding_diffusion_model.md
    - CI: contributing/ci
    - Design Documents:
      - design/index.md
      - design/architecture_overview.md
      - Feature Design:
        - design/feature/disaggregated_inference.md
        - design/feature/ray_based_execution.md
      - Module Design:
        - design/module/ar_module.md
        - design/module/dit_module.md
        - design/module/entrypoint_module.md
    - Docs Guide: contributing/DOCS_GUIDE.md
  - API Reference:
    - api/README.md
    - api/vllm_omni
  - CLI Reference: cli
  - Community:
    - community/*
    - Slack: https://slack.vllm.ai
    - Blog: https://blog.vllm.ai
    - Forum: https://discuss.vllm.ai
---
hide:
- navigation
- toc
---
# Welcome to vLLM-Omni
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" src="./source/logos/vllm-omni-logo.png">
<img alt="vllm-omni" src="./source/logos/vllm-omni-logo.png" width=55%>
</picture>
</p>
<h3 align="center">
Easy, fast, and cheap omni-modality model serving for everyone
</h3>
<p style="text-align:center">
<script async defer src="https://buttons.github.io/buttons.js"></script>
<a class="github-button" href="https://github.com/vllm-project/vllm-omni" data-show-count="true" data-size="large" aria-label="Star">Star</a>
<a class="github-button" href="https://github.com/vllm-project/vllm-omni/subscription" data-show-count="true" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
<a class="github-button" href="https://github.com/vllm-project/vllm-omni/fork" data-show-count="true" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
</p>
## About
[vLLM](https://github.com/vllm-project/vllm) was originally designed to support large language models for text-based autoregressive generation tasks. vLLM-Omni is a framework that extends vLLM to omni-modality model inference and serving:
- **Omni-modality**: Text, image, video, and audio data processing
- **Non-autoregressive architectures**: Extends vLLM's AR support to Diffusion Transformers (DiT) and other parallel generation models
- **Heterogeneous outputs**: From traditional text generation to multimodal outputs
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" src="./source/architecture/omni-modality-model-architecture.png">
<img alt="vllm-omni-arch" src="./source/architecture/omni-modality-model-architecture.png" width=55%>
</picture>
</p>
vLLM-Omni is fast with:
- State-of-the-art AR support by leveraging efficient KV cache management from vLLM
- Pipelined stage execution with overlapping for high-throughput performance
- Full disaggregation based on OmniConnector and dynamic resource allocation across stages
vLLM-Omni is flexible and easy to use with:
- Heterogeneous pipeline abstraction to manage complex model workflows
- Seamless integration with popular Hugging Face models
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
vLLM-Omni seamlessly supports most popular open-source models on Hugging Face, including:
- Omni-modality models (e.g., Qwen2.5-Omni, Qwen3-Omni)
- Multi-modality generation models (e.g., Qwen-Image)
For more information, check out the following:
- [vllm-omni architecture design and recent roadmaps](https://docs.google.com/presentation/d/1qv4qMW1rKAqDREMXiUDLIgqqHQe7TDPj/edit?usp=sharing&ouid=110473603432222024453&rtpof=true&sd=true)
- [vllm-omni announcement blogpost](https://blog.vllm.ai/2025/11/30/vllm-omni.html)
# Summary
## Entry Points
Main entry points for vLLM-Omni inference and serving.
- [vllm_omni.entrypoints.async_omni.AsyncOmni][]
- [vllm_omni.entrypoints.async_omni_diffusion.AsyncOmniDiffusion][]
- [vllm_omni.entrypoints.async_omni_llm.AsyncOmniLLM][]
- [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalContentParser][]
- [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalItemTracker][]
- [vllm_omni.entrypoints.chat_utils.parse_chat_messages_futures][]
- [vllm_omni.entrypoints.cli.serve.OmniServeCommand][]
- [vllm_omni.entrypoints.client_request_state.ClientRequestState][]
- [vllm_omni.entrypoints.log_utils.OrchestratorMetrics][]
- [vllm_omni.entrypoints.log_utils.StageRequestMetrics][]
- [vllm_omni.entrypoints.log_utils.StageStats][]
- [vllm_omni.entrypoints.omni.Omni][]
- [vllm_omni.entrypoints.omni.OmniBase][]
- [vllm_omni.entrypoints.omni_diffusion.OmniDiffusion][]
- [vllm_omni.entrypoints.omni_llm.OmniLLM][]
- [vllm_omni.entrypoints.omni_stage.OmniStage][]
- [vllm_omni.entrypoints.stage_utils.OmniStageTaskType][]
## Inputs
Input data structures for multi-modal inputs.
- [vllm_omni.inputs.data.OmniEmbedsPrompt][]
- [vllm_omni.inputs.data.OmniTokenInputs][]
- [vllm_omni.inputs.data.OmniTokensPrompt][]
- [vllm_omni.inputs.parse.parse_singleton_prompt_omni][]
- [vllm_omni.inputs.preprocess.OmniInputPreprocessor][]
## Engine
Engine classes for offline and online inference.
- [vllm_omni.diffusion.diffusion_engine.DiffusionEngine][]
- [vllm_omni.engine.AdditionalInformationEntry][]
- [vllm_omni.engine.AdditionalInformationPayload][]
- [vllm_omni.engine.OmniEngineCoreOutput][]
- [vllm_omni.engine.OmniEngineCoreOutputs][]
- [vllm_omni.engine.OmniEngineCoreRequest][]
- [vllm_omni.engine.PromptEmbedsPayload][]
- [vllm_omni.engine.arg_utils.AsyncOmniEngineArgs][]
- [vllm_omni.engine.arg_utils.OmniEngineArgs][]
- [vllm_omni.engine.input_processor.OmniInputProcessor][]
- [vllm_omni.engine.output_processor.MultimodalOutputProcessor][]
- [vllm_omni.engine.output_processor.OmniRequestState][]
## Core
Core scheduling and caching components.
- [vllm_omni.core.sched.omni_ar_scheduler.KVCacheTransferData][]
- [vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler][]
- [vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler][]
- [vllm_omni.core.sched.output.OmniCachedRequestData][]
- [vllm_omni.core.sched.output.OmniNewRequestData][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.DistributedGroupResidualVectorQuantization][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.DistributedResidualVectorQuantization][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.EuclideanCodebook][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.VectorQuantization][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.preprocess][]
## Configuration
Configuration classes.
- [vllm_omni.config.model.OmniModelConfig][]
- [vllm_omni.diffusion.cache.teacache.config.TeaCacheConfig][]
- [vllm_omni.distributed.omni_connectors.utils.config.ConnectorSpec][]
- [vllm_omni.distributed.omni_connectors.utils.config.OmniTransferConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSSpeakerEncoderConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSTalkerCodePredictorConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSTalkerConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2.Qwen3TTSTokenizerV2Config][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2.Qwen3TTSTokenizerV2DecoderConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1Config][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderBigVGANConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderDiTConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1EncoderConfig][]
## Workers
Worker classes and model runners for distributed inference.
- [vllm_omni.diffusion.worker.gpu_diffusion_model_runner.GPUDiffusionModelRunner][]
- [vllm_omni.diffusion.worker.gpu_diffusion_worker.GPUDiffusionWorker][]
- [vllm_omni.diffusion.worker.gpu_diffusion_worker.WorkerProc][]
- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorker][]
- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorkerProc][]
- [vllm_omni.worker.gpu_ar_model_runner.ExecuteModelState][]
- [vllm_omni.worker.gpu_ar_model_runner.GPUARModelRunner][]
- [vllm_omni.worker.gpu_ar_worker.GPUARWorker][]
- [vllm_omni.worker.gpu_generation_model_runner.GPUGenerationModelRunner][]
- [vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker][]
- [vllm_omni.worker.gpu_model_runner.OmniGPUModelRunner][]
- [vllm_omni.worker.npu.npu_ar_model_runner.ExecuteModelState][]
- [vllm_omni.worker.npu.npu_ar_model_runner.NPUARModelRunner][]
- [vllm_omni.worker.npu.npu_ar_worker.NPUARWorker][]
- [vllm_omni.worker.npu.npu_generation_model_runner.NPUGenerationModelRunner][]
- [vllm_omni.worker.npu.npu_generation_worker.NPUGenerationWorker][]
- [vllm_omni.worker.npu.npu_model_runner.OmniNPUModelRunner][]
# vLLM-Omni CLI Guide
The vLLM-Omni CLI inherits from vLLM and adds a few extra arguments.
## serve
Starts the vLLM-Omni OpenAI Compatible API server.
Start with a model:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni
```
Specify the port:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091
```
If you have a custom stage configs file, launch the server with the command below:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --stage-configs-path /path/to/stage_configs_file
```
## bench
Run benchmark tests for online serving throughput.
Example command:
```bash
vllm bench serve --omni \
--model Qwen/Qwen2.5-Omni-7B \
--host server-host \
--port server-port \
--random-input-len 32 \
--random-output-len 4 \
--num-prompts 5
```
See [vllm bench serve](./bench/serve.md) for the full reference of all available arguments.
# Contact Us
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm-omni/issues)
- For coordinating contributions and development, and for discussions with other users and developers, please join the `sig-omni` channel in our [Slack](https://slack.vllm.ai/) or use the [vLLM Forum](https://discuss.vllm.ai/)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm-omni/security/advisories) feature
# Volunteers for Bugfix and CI
We encourage you to check the current docs and [issues](https://github.com/vllm-project/vllm-omni/issues) for possible solutions to your questions. If none of these resolve your problem, please open an issue describing the bug or CI failure.
If you urgently need help locating and fixing a bug or CI problem, please reach out to the community volunteers listed below.
| Dec 4-Dec 12 | Dec 15-Dec 19 | Dec 22-Dec 26 | Dec 29-Jan 2, 2026 | Jan 5-Jan 9 | Jan 12-Jan 16 |
|----------|----------|----------|----------|----------|----------|
| <a href="https://github.com/congw729">Conw729</a> | <a href="https://github.com/yinpeiqi">yinpeiqi</a> | <a href="https://github.com/tzhouam">tzhouam</a> | <a href="https://github.com/SamitHuang">SamitHuang</a> | <a href="https://github.com/gcanlin">gcanlin</a> | <a href="https://github.com/natureofnature">natureofnature</a> |
| <a href="https://github.com/david6666666">david6666666</a> | <a href="https://github.com/R2-Y">R2-Y</a> | <a href="https://github.com/hsliuustc0106">hsliuustc0106</a> | <a href="https://github.com/Gaohan123">Gaohan123</a> | <a href="https://github.com/ZJY0516">ZJY0516</a> | <a href="https://github.com/qibaoyuan">qibaoyuan</a> |
We kindly welcome more contributors to fix bugs and contribute new features!
# Configuration Options
This section lists the most common options for running vLLM-Omni.
For options within a vLLM engine, please refer to the [vLLM Configuration](https://docs.vllm.ai/en/v0.14.0/configuration/index.html) documentation.
Currently, the main options are maintained in per-model stage configs.
For a concrete example, see the [Qwen2.5-Omni stage config](stage_configs/qwen2_5_omni.yaml).
For an introduction, see the [Introduction to stage configs](./stage_configs.md).
## Memory Configuration
- **[GPU Memory Calculation and Configuration](./gpu_memory_utilization.md)** - Guide on how to calculate memory requirements and set up `gpu_memory_utilization` for optimal performance
## Optimization Features
- **[TeaCache Configuration](../user_guide/diffusion/teacache.md)** - Enable TeaCache adaptive caching for DiT models to achieve 1.5x-2.0x speedup with minimal quality loss
- **[Cache-DiT Configuration](../user_guide/diffusion/cache_dit_acceleration.md)** - Enable Cache-DiT as cache acceleration backends for DiT models
- **[Parallelism Configuration](../user_guide/diffusion/parallelism_acceleration.md)** - Enable parallelism (e.g., sequence parallelism) for DiT models
# GPU Memory Calculation and Configuration
This guide explains how to calculate GPU memory requirements and properly configure `gpu_memory_utilization` for vLLM-Omni stages.
## Overview
`gpu_memory_utilization` is a critical parameter that controls how much GPU memory each stage can use. It's specified as a fraction between 0.0 and 1.0, where:
- `0.8` means 80% of the GPU's total memory
- `1.0` means 100% of the GPU's total memory (not recommended, leaves no buffer)
## How Memory is Calculated
### Memory Allocation Formula
For each stage, vLLM-Omni calculates the requested memory as:
```
requested_memory = total_gpu_memory × gpu_memory_utilization
```
The system checks that:
```
free_memory ≥ requested_memory
```
If this condition is not met, the stage will fail to initialize with an error message showing the memory requirements.
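A minimal sketch of this check, assuming PyTorch is available (the function name and error message are illustrative, not the actual vLLM-Omni internals):

```python
import torch

def check_gpu_memory(device: int, gpu_memory_utilization: float) -> None:
    """Rough sketch of the free-memory check described above."""
    # mem_get_info returns (free_bytes, total_bytes) for the given device
    free_bytes, total_bytes = torch.cuda.mem_get_info(device)
    requested_bytes = total_bytes * gpu_memory_utilization
    if free_bytes < requested_bytes:
        raise RuntimeError(
            f"Free memory ({free_bytes / 1e9:.1f} GB) is less than the requested "
            f"{requested_bytes / 1e9:.1f} GB ({gpu_memory_utilization:.0%} of "
            f"{total_bytes / 1e9:.1f} GB total) on GPU {device}."
        )

# Example: require 80% of GPU 0 before initializing a stage
# check_gpu_memory(device=0, gpu_memory_utilization=0.8)
```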
### Memory Components
The total memory used by a stage includes:
1. **Model Weights**: The size of the model parameters loaded on the GPU
2. **KV Cache**: Memory for storing key-value cache during generation
3. **Activation Memory**: Temporary memory for intermediate computations
4. **System Overhead**: Memory used by CUDA, PyTorch, and other system components
5. **Non-Torch Memory**: Memory allocated outside of PyTorch (e.g., CUDA graphs)
### Example Calculation
For a GPU with 80GB total memory:
- `gpu_memory_utilization: 0.8` → 64GB available for the stage
- `gpu_memory_utilization: 0.6` → 48GB available for the stage
- `gpu_memory_utilization: 0.15` → 12GB available for the stage
## Setting Up `gpu_memory_utilization`
### Step 1: Determine GPU Memory
First, check your GPU's total memory:
```bash
# Using nvidia-smi
nvidia-smi --query-gpu=memory.total --format=csv
# Or using Python
python -c "import torch; print(f'{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')"
```
### Step 2: Estimate Model Memory Requirements
#### For Autoregressive (AR) Stages
AR stages typically need more memory due to:
- Large model weights
- KV cache for attention
- Activation buffers
#### For Diffusion/Generation Stages
Diffusion stages (like code2wav) typically need less memory:
- Smaller model components
- Different memory access patterns
**Typical values:**
- `0.1 - 0.3` for most diffusion stages
### Step 3: Consider Multi-Stage Scenarios
When multiple stages share the same GPU, you must ensure the sum of their `gpu_memory_utilization` values doesn't exceed 1.0.
**Example: Two stages on GPU 0**
```yaml
stage_args:
- stage_id: 0
runtime:
devices: "0"
engine_args:
gpu_memory_utilization: 0.6 # Uses 60% of GPU 0
- stage_id: 1
runtime:
devices: "0"
engine_args:
gpu_memory_utilization: 0.3 # Uses 30% of GPU 0
# Total: 90% of GPU 0 (safe, leaves 10% buffer)
```
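Before launching, you can sanity-check such a config with a small helper like the one below (an illustrative sketch, not part of vLLM-Omni; the dict layout simply mirrors the YAML above):

```python
from collections import defaultdict

def check_stage_memory_budget(stage_args: list[dict]) -> None:
    """Warn if stages sharing a GPU request more than 100% of its memory in total."""
    per_device = defaultdict(float)
    for stage in stage_args:
        util = stage["engine_args"]["gpu_memory_utilization"]
        for device in stage["runtime"]["devices"].split(","):
            per_device[device.strip()] += util
    for device, total in sorted(per_device.items()):
        status = "over-committed!" if total > 1.0 else "ok"
        print(f"GPU {device}: {total:.0%} requested ({status})")

# The two stages from the YAML example above, both on GPU 0:
check_stage_memory_budget([
    {"runtime": {"devices": "0"}, "engine_args": {"gpu_memory_utilization": 0.6}},
    {"runtime": {"devices": "0"}, "engine_args": {"gpu_memory_utilization": 0.3}},
])
# GPU 0: 90% requested (ok)
```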
**Important:** If stages run on different GPUs, each can use up to 1.0 independently.
### Step 4: Account for Tensor Parallelism
When using `tensor_parallel_size > 1`, the model is split across multiple GPUs, so each GPU needs less memory.
**Example: 2-way tensor parallelism**
```yaml
stage_args:
- stage_id: 0
runtime:
devices: "0,1" # Uses both GPUs
engine_args:
tensor_parallel_size: 2
gpu_memory_utilization: 0.6 # 60% per GPU
# Model is split, so each GPU uses ~30% of model memory
```
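As a rough back-of-the-envelope estimate (illustrative numbers, assuming the weights are sharded evenly), the per-GPU weight footprint under tensor parallelism is approximately the total weight memory divided by `tensor_parallel_size`:

```python
def per_gpu_weight_memory_gb(num_params: float, dtype_bytes: int, tp_size: int) -> float:
    """Approximate per-GPU weight memory when weights are sharded across tp_size GPUs."""
    return num_params * dtype_bytes / tp_size / 1e9

# Hypothetical 30B-parameter model in BF16 (2 bytes per parameter) with TP=2: ~30 GB per GPU
print(f"{per_gpu_weight_memory_gb(30e9, 2, 2):.0f} GB per GPU")
```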
## Examples
### Qwen3-Omni-MoE on 2x H100-80GB
```yaml
stage_args:
- stage_id: 0 # Thinker stage with TP=2
runtime:
devices: "0,1"
engine_args:
tensor_parallel_size: 2
gpu_memory_utilization: 0.6 # 48GB per GPU
- stage_id: 1 # Talker stage
runtime:
devices: "1"
engine_args:
gpu_memory_utilization: 0.3 # 24GB on GPU 1
- stage_id: 2 # Code2Wav stage
runtime:
devices: "0"
engine_args:
gpu_memory_utilization: 0.1 # 8GB on GPU 0
```
**Note:** In this configuration, stages 0 and 2 share GPU 0, but they run at different times in the pipeline, so their memory usage doesn't overlap.
## Troubleshooting
### Error: "Free memory is less than desired GPU memory utilization"
This means the GPU doesn't have enough free memory when the stage starts.
**Solutions:**
1. Free up memory by closing other processes
2. Reduce `gpu_memory_utilization` for this stage
3. Use a GPU with more memory
4. Move the stage to a different GPU
### Error: OOM during inference
The stage initialized but ran out of memory during processing.
**Solutions:**
1. Reduce `max_num_batched_tokens`
2. Reduce `max_batch_size` in runtime config
3. Lower `gpu_memory_utilization` slightly
4. Enable quantization if supported
### Memory Not Fully Utilized
If you see low memory usage, you can:
1. Increase `gpu_memory_utilization` to allow larger KV cache
2. Increase `max_num_batched_tokens` for better batching
3. Check if other stages are limiting throughput
## Useful Formulas for Memory Calculation
### KV Cache Memory
The KV cache size depends on:
- Number of sequences in batch
- Sequence length (prompt + generation)
- Model hidden size
- Number of attention heads
- Number of layers
Approximate formula:
```
kv_cache_memory ≈ batch_size × seq_len × hidden_size × num_layers × 2 × dtype_size
```
The factor of 2 accounts for the key and value tensors.
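As a worked example (illustrative numbers only, assuming full-precision KV cache without grouped-query attention), the formula can be evaluated directly:

```python
def kv_cache_memory_gb(batch_size: int, seq_len: int, hidden_size: int,
                       num_layers: int, dtype_bytes: int = 2) -> float:
    """Approximate KV cache size in GB; the factor of 2 covers keys and values."""
    return batch_size * seq_len * hidden_size * num_layers * 2 * dtype_bytes / 1e9

# Hypothetical 7B-class model: hidden_size=4096, 32 layers, batch of 8, 4096-token sequences, BF16
print(f"{kv_cache_memory_gb(8, 4096, 4096, 32, 2):.1f} GB")  # ~17.2 GB
```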
### Model Weight Memory
```
model_memory ≈ num_parameters × dtype_size
```
For example:
- 7B parameters in FP16: ~14GB
- 7B parameters in FP32: ~28GB
- 7B parameters in INT8: ~7GB
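These numbers follow directly from the formula; a tiny sketch that reproduces them (the parameter count and dtype sizes are the illustrative values above):

```python
def model_weight_memory_gb(num_parameters: float, dtype_size_bytes: int) -> float:
    """Approximate model weight memory in GB: num_parameters × dtype_size."""
    return num_parameters * dtype_size_bytes / 1e9

for dtype, size in [("FP32", 4), ("FP16", 2), ("INT8", 1)]:
    print(f"7B parameters in {dtype}: ~{model_weight_memory_gb(7e9, size):.0f} GB")
```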
### Activation Memory
Activation memory is typically smaller but varies with:
- Batch size
- Sequence length
- Model architecture
It's usually 10-30% of model weight memory during inference.