Commit c1cacde6 authored by weishb

vllm-omni_0.15.0.rc1+fix1 first commit

parent 35607782
import argparse
import json
import os
import soundfile as sf
from qwen3_omni_moe_model import Qwen3OmniMoeForConditionalGenerationWithLogging
from qwen_omni_utils import process_mm_info
from tqdm import tqdm
from transformers import Qwen3OmniMoeProcessor
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking"
def load_prompts(prompts_file: str) -> list[str]:
    """Load prompts from a text file, one prompt per line."""
    prompts = []
    with open(prompts_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                prompts.append(line)
    return prompts
def run_benchmark(
    model,
    processor,
    prompts: list[str],
    output_dir: str = "benchmark_results",
    speaker: str = "Ethan",
    use_audio_in_video: bool = True,
):
    """
    Run benchmark on a list of prompts and collect performance stats.

    Args:
        model: The Qwen3OmniMoe model
        processor: The Qwen3OmniMoe processor
        prompts: List of text prompts to process
        output_dir: Directory to save results
        speaker: Speaker voice for audio output
        use_audio_in_video: Whether to use audio in video

    Returns:
        tuple: (aggregated_stats, results, audio_outputs)
            - aggregated_stats: dict with aggregated performance statistics
            - results: list of dicts with per-prompt results
            - audio_outputs: list of audio tensors/arrays (or None if no audio)
    """
    os.makedirs(output_dir, exist_ok=True)
    audio_dir = os.path.join(output_dir, "audio")
    os.makedirs(audio_dir, exist_ok=True)

    all_stats = []
    results = []
    audio_outputs = []

    for idx, prompt in enumerate(tqdm(prompts, desc="Processing prompts")):
        conversation = [
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}],
            },
        ]

        # Preparation for inference
        text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
        audios, images, videos = process_mm_info(conversation, use_audio_in_video=use_audio_in_video)
        inputs = processor(
            text=text,
            audio=audios,
            images=images,
            videos=videos,
            return_tensors="pt",
            padding=True,
            use_audio_in_video=use_audio_in_video,
        )
        inputs = inputs.to(model.device).to(model.dtype)

        # Inference: generation of the output text and audio
        text_ids, audio = model.generate(
            **inputs, speaker=speaker, thinker_return_dict_in_generate=True, use_audio_in_video=use_audio_in_video
        )

        # Decode output text
        output_text = processor.batch_decode(
            text_ids.sequences[:, inputs["input_ids"].shape[1] :],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]

        # Collect performance stats
        perf_stats = None
        if hasattr(model, "_perf_stats_last"):
            perf_stats = model._perf_stats_last.copy()
            perf_stats["prompt_idx"] = idx
            perf_stats["prompt"] = prompt
            all_stats.append(perf_stats)

        # Save audio and collect audio output
        audio_path = None
        audio_data = None
        if audio is not None:
            audio_data = audio.reshape(-1).detach().cpu().numpy()
            audio_path = os.path.join(audio_dir, f"output_{idx:04d}.wav")
            sf.write(
                audio_path,
                audio_data,
                samplerate=24000,
            )
            audio_outputs.append(audio_data)
        else:
            audio_outputs.append(None)

        # Save result
        result = {
            "idx": idx,
            "prompt": prompt,
            "output": output_text,
            "audio_path": audio_path,
            "perf_stats": perf_stats,
        }
        results.append(result)

    # Aggregate statistics
    aggregated_stats = aggregate_stats(all_stats)

    # Save all results
    results_path = os.path.join(output_dir, "results.json")
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    # Save aggregated stats
    stats_path = os.path.join(output_dir, "perf_stats.json")
    with open(stats_path, "w", encoding="utf-8") as f:
        json.dump({"aggregated": aggregated_stats, "per_prompt": all_stats}, f, ensure_ascii=False, indent=2)

    # Count saved audio files
    num_audio_saved = sum(1 for a in audio_outputs if a is not None)
    print(f"\nSaved {num_audio_saved} audio files to {audio_dir}/")

    return aggregated_stats, results, audio_outputs
def aggregate_stats(all_stats: list[dict]) -> dict:
    """Aggregate performance statistics from multiple runs."""
    if not all_stats:
        return {}

    keys = [
        "thinker_tokens",
        "thinker_time_s",
        "thinker_tps",
        "talker_tokens",
        "talker_time_s",
        "talker_tps",
        "code2wav_tokens",
        "code2wav_time_s",
        "code2wav_tps",
        "total_tokens",
        "total_time_s",
        "total_tps",
    ]

    aggregated = {
        "num_samples": len(all_stats),
    }
    for key in keys:
        values = [s.get(key, 0) for s in all_stats if key in s]
        if values:
            aggregated[f"{key}_sum"] = sum(values)
            aggregated[f"{key}_avg"] = sum(values) / len(values)
            aggregated[f"{key}_min"] = min(values)
            aggregated[f"{key}_max"] = max(values)

    # Calculate overall throughput
    total_tokens = aggregated.get("total_tokens_sum", 0)
    total_time = aggregated.get("total_time_s_sum", 0)
    if total_time > 0:
        aggregated["overall_tps"] = total_tokens / total_time

    return aggregated
def print_stats(stats: dict):
    """Print performance statistics in a formatted way."""
    print("\n" + "=" * 60)
    print("Performance Statistics Summary")
    print("=" * 60)
    print(f"\nNumber of samples: {stats.get('num_samples', 0)}")

    print("\n--- Thinker ---")
    print(f"  Total tokens: {stats.get('thinker_tokens_sum', 0):.0f}")
    print(f"  Total time:   {stats.get('thinker_time_s_sum', 0):.2f}s")
    print(f"  Avg TPS:      {stats.get('thinker_tps_avg', 0):.2f}")
    print(f"  Min TPS:      {stats.get('thinker_tps_min', 0):.2f}")
    print(f"  Max TPS:      {stats.get('thinker_tps_max', 0):.2f}")

    print("\n--- Talker ---")
    print(f"  Total tokens: {stats.get('talker_tokens_sum', 0):.0f}")
    print(f"  Total time:   {stats.get('talker_time_s_sum', 0):.2f}s")
    print(f"  Avg TPS:      {stats.get('talker_tps_avg', 0):.2f}")
    print(f"  Min TPS:      {stats.get('talker_tps_min', 0):.2f}")
    print(f"  Max TPS:      {stats.get('talker_tps_max', 0):.2f}")

    print("\n--- Code2Wav ---")
    print(f"  Total tokens: {stats.get('code2wav_tokens_sum', 0):.0f}")
    print(f"  Total time:   {stats.get('code2wav_time_s_sum', 0):.2f}s")
    print(f"  Avg TPS:      {stats.get('code2wav_tps_avg', 0):.2f}")
    print(f"  Min TPS:      {stats.get('code2wav_tps_min', 0):.2f}")
    print(f"  Max TPS:      {stats.get('code2wav_tps_max', 0):.2f}")

    print("\n--- Overall ---")
    print(f"  Total tokens: {stats.get('total_tokens_sum', 0):.0f}")
    print(f"  Total time:   {stats.get('total_time_s_sum', 0):.2f}s")
    print(f"  Overall TPS:  {stats.get('overall_tps', 0):.2f}")
    print(f"  Avg TPS:      {stats.get('total_tps_avg', 0):.2f}")
    print(f"  Min TPS:      {stats.get('total_tps_min', 0):.2f}")
    print(f"  Max TPS:      {stats.get('total_tps_max', 0):.2f}")
    print("=" * 60 + "\n")
def main():
    parser = argparse.ArgumentParser(description="Qwen3-Omni Benchmark Script")
    parser.add_argument(
        "--prompts_file",
        type=str,
        default="benchmark/build_dataset/top100.txt",
        help="Path to the prompts file (one prompt per line)",
    )
    parser.add_argument(
        "--output_dir", type=str, default="benchmark_results", help="Directory to save benchmark results"
    )
    parser.add_argument("--model_path", type=str, default=MODEL_PATH, help="Path to the model")
    parser.add_argument("--speaker", type=str, default="Ethan", help="Speaker voice for audio output")
    parser.add_argument("--num_prompts", type=int, default=None, help="Number of prompts to process (default: all)")
    args = parser.parse_args()

    # Load model and processor
    print(f"Loading model from {args.model_path}...")
    model = Qwen3OmniMoeForConditionalGenerationWithLogging.from_pretrained(
        args.model_path,
        dtype="auto",
        device_map="auto",
        attn_implementation="flash_attention_2",
    )
    processor = Qwen3OmniMoeProcessor.from_pretrained(args.model_path)

    # Benchmark mode
    print(f"Loading prompts from {args.prompts_file}...")
    prompts = load_prompts(args.prompts_file)
    if args.num_prompts:
        prompts = prompts[: args.num_prompts]

    print(f"Running benchmark on {len(prompts)} prompts...")
    aggregated_stats, results, audio_outputs = run_benchmark(
        model=model,
        processor=processor,
        prompts=prompts,
        output_dir=args.output_dir,
        speaker=args.speaker,
    )
    print_stats(aggregated_stats)
    print(f"\nResults saved to {args.output_dir}/")


if __name__ == "__main__":
    main()
#!/bin/bash
# Qwen3-Omni Benchmark Evaluation Script
# This script must be run from the vllm-omni root directory
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Navigate to the vllm-omni root directory (three levels up from the script location)
VLLM_OMNI_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
cd "$VLLM_OMNI_ROOT" || { echo "Error: Failed to navigate to vllm-omni directory"; exit 1; }
echo "Working directory: $(pwd)"
# Verify we're in the correct directory and run benchmark
if [[ ! -d "benchmarks/qwen3-omni/vllm_omni" ]]; then
    echo "Error: Not in vllm-omni root directory. Please run from the vllm-omni folder."
else
    log_dir=benchmarks/qwen3-omni/vllm_omni/logs
    outputs_dir=benchmarks/qwen3-omni/vllm_omni/outputs
    end2end_script_path=examples/offline_inference/qwen3_omni/end2end.py
    build_dataset_path=benchmarks/build_dataset/top100.txt
    python $end2end_script_path --output-wav $outputs_dir \
        --query-type text \
        --txt-prompts $build_dataset_path \
        --enable-stats \
        --log-dir $log_dir
    echo "Logs and outputs are saved in ${log_dir} and ${outputs_dir} respectively:"
    echo "  - omni_llm_pipeline_text                            run dir/base name"
    echo "  - omni_llm_pipeline_text.orchestrator.stats.jsonl   orchestrator-stage latency stats"
    echo "  - omni_llm_pipeline_text.overall.stats.jsonl        overall latency/TPS stats"
    echo "  - omni_llm_pipeline_text.stage0.log                 per-stage detailed logs"
    echo "  - omni_llm_pipeline_text.stage1.log"
    echo "  - omni_llm_pipeline_text.stage2.log"
    echo "Key checks: overall.stats.jsonl for end-to-end latency/TPS; orchestrator.stats.jsonl for stable per-stage latency; stage*.log for errors or long tails."
    echo "  - outputs/                                          generated txt and wav files (there should be 100 text and 100 wav files)"
fi
ARG VLLM_BASE_IMAGE=vllm/vllm-openai
ARG VLLM_BASE_TAG=v0.15.0
FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG}
ARG APP_DIR=/workspace/vllm-omni
WORKDIR ${APP_DIR}
COPY . .
# Install system dependencies
RUN apt-get update && \
apt-get install -y ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install vllm-omni into the same uv-managed Python environment used by the base image.
RUN uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]"
RUN ln -sf /usr/bin/python3 /usr/bin/python
ENTRYPOINT []
ARG VLLM_ASCEND_IMAGE=quay.nju.edu.cn/ascend/vllm-ascend
ARG VLLM_ASCEND_TAG=v0.11.0rc2
FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}
ARG APP_DIR=/vllm-workspace/vllm-omni
WORKDIR ${APP_DIR}
COPY . .
# Install vllm-omni with dev dependencies
RUN pip install --no-cache-dir -e ".[dev]"
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENTRYPOINT []
ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.15.0
FROM ${BASE_IMAGE} AS final
ARG COMMON_WORKDIR=/app
WORKDIR ${COMMON_WORKDIR}
# Step 1: Setup - Install system dependencies
RUN apt-get update && \
apt-get install -y ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni
# Step 2: Copy vllm-omni code and install it into the base image's Python environment
COPY . ${COMMON_WORKDIR}/vllm-omni
RUN cd ${COMMON_WORKDIR}/vllm-omni && uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]"
# Before installing onnxruntime-rocm, the system-installed onnxruntime must be uninstalled first.
# These packages are dependencies of Qwen3-TTS.
RUN uv pip uninstall onnxruntime --system && uv pip install --no-cache-dir onnxruntime-rocm sox --system
RUN ln -sf /usr/bin/python3 /usr/bin/python
CMD ["/bin/bash"]
ENTRYPOINT []
# Set entrypoint for vllm-openai official images
FROM final AS vllm-openai
ENTRYPOINT ["vllm", "serve", "--omni"]
nav:
  - Home: README.md
  - User Guide:
    - Getting Started:
      - getting_started/quickstart.md
      - getting_started/installation/*
    - Serving:
      - OpenAI-Compatible API:
        - Image Generation: serving/image_generation_api.md
        - Image Edit: serving/image_edit_api.md
    - Examples:
      - examples/README.md
      - Offline Inference:
        - Image-To-Image: user_guide/examples/offline_inference/image_to_image.md
        - Image-To-Video: user_guide/examples/offline_inference/image_to_video.md
        - Qwen2.5-Omni: user_guide/examples/offline_inference/qwen2_5_omni.md
        - Qwen3-Omni: user_guide/examples/offline_inference/qwen3_omni.md
        - Qwen3-TTS Offline Inference: user_guide/examples/offline_inference/qwen3_tts.md
        - Text-To-Image: user_guide/examples/offline_inference/text_to_image.md
        - Text-To-Video: user_guide/examples/offline_inference/text_to_video.md
      - Online Serving:
        - Image-To-Image: user_guide/examples/online_serving/image_to_image.md
        - Qwen2.5-Omni: user_guide/examples/online_serving/qwen2_5_omni.md
        - Qwen3-Omni: user_guide/examples/online_serving/qwen3_omni.md
        - Text-To-Image: user_guide/examples/online_serving/text_to_image.md
    - General:
      - usage/*
    - Configuration:
      - configuration/README.md
      - configuration/*
    - Models:
      - models/supported_models.md
    - Features:
      - Sleep Mode: features/sleep_mode.md
    - Diffusion Features:
      - Overview: user_guide/diffusion_acceleration.md
      - TeaCache: user_guide/diffusion/teacache.md
      - Cache-DiT: user_guide/diffusion/cache_dit_acceleration.md
      - Parallelism Acceleration: user_guide/diffusion/parallelism_acceleration.md
      - CPU Offloading: user_guide/diffusion/cpu_offload_diffusion.md
  - Developer Guide:
    - General:
      - contributing/README.md
      - glob: contributing/*
        flatten_single_child_sections: true
    - Model Implementation:
      - contributing/model/README.md
      - contributing/model/adding_omni_model.md
      - contributing/model/adding_diffusion_model.md
    - CI: contributing/ci
    - Design Documents:
      - design/index.md
      - design/architecture_overview.md
      - Feature Design:
        - design/feature/disaggregated_inference.md
        - design/feature/ray_based_execution.md
      - Module Design:
        - design/module/ar_module.md
        - design/module/dit_module.md
        - design/module/entrypoint_module.md
    - Docs Guide: contributing/DOCS_GUIDE.md
  - API Reference:
    - api/README.md
    - api/vllm_omni
  - CLI Reference: cli
  - Community:
    - community/*
    - Slack: https://slack.vllm.ai
    - Blog: https://blog.vllm.ai
    - Forum: https://discuss.vllm.ai
---
hide:
- navigation
- toc
---
# Welcome to vLLM-Omni
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" src="./source/logos/vllm-omni-logo.png">
<img alt="vllm-omni" src="./source/logos/vllm-omni-logo.png" width=55%>
</picture>
</p>
<h3 align="center">
Easy, fast, and cheap omni-modality model serving for everyone
</h3>
<p style="text-align:center">
<script async defer src="https://buttons.github.io/buttons.js"></script>
<a class="github-button" href="https://github.com/vllm-project/vllm-omni" data-show-count="true" data-size="large" aria-label="Star">Star</a>
<a class="github-button" href="https://github.com/vllm-project/vllm-omni/subscription" data-show-count="true" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
<a class="github-button" href="https://github.com/vllm-project/vllm-omni/fork" data-show-count="true" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
</p>
## About
[vLLM](https://github.com/vllm-project/vllm) was originally designed to support large language models for text-based autoregressive generation tasks. vLLM-Omni is a framework that extends vLLM to omni-modality model inference and serving:
- **Omni-modality**: Text, image, video, and audio data processing
- **Non-autoregressive architectures**: Extends vLLM's AR support to Diffusion Transformers (DiT) and other parallel generation models
- **Heterogeneous outputs**: From traditional text generation to multimodal outputs
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" src="./source/architecture/omni-modality-model-architecture.png">
<img alt="vllm-omni-arch" src="./source/architecture/omni-modality-model-architecture.png" width=55%>
</picture>
</p>
vLLM-Omni is fast with:
- State-of-the-art AR support by leveraging efficient KV cache management from vLLM
- Pipelined stage execution with overlapping for high-throughput performance
- Full disaggregation based on OmniConnector and dynamic resource allocation across stages
vLLM-Omni is flexible and easy to use with:
- Heterogeneous pipeline abstraction to manage complex model workflows
- Seamless integration with popular Hugging Face models
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
vLLM-Omni seamlessly supports most popular open-source models on Hugging Face, including:
- Omni-modality models (e.g., Qwen2.5-Omni, Qwen3-Omni)
- Multi-modality generation models (e.g., Qwen-Image)
For more information, check out the following:
- [vllm-omni architecture design and recent roadmaps](https://docs.google.com/presentation/d/1qv4qMW1rKAqDREMXiUDLIgqqHQe7TDPj/edit?usp=sharing&ouid=110473603432222024453&rtpof=true&sd=true)
- [vllm-omni announcement blogpost](https://blog.vllm.ai/2025/11/30/vllm-omni.html)
# Summary
## Entry Points
Main entry points for vLLM-Omni inference and serving.
- [vllm_omni.entrypoints.async_omni.AsyncOmni][]
- [vllm_omni.entrypoints.async_omni_diffusion.AsyncOmniDiffusion][]
- [vllm_omni.entrypoints.async_omni_llm.AsyncOmniLLM][]
- [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalContentParser][]
- [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalItemTracker][]
- [vllm_omni.entrypoints.chat_utils.parse_chat_messages_futures][]
- [vllm_omni.entrypoints.cli.serve.OmniServeCommand][]
- [vllm_omni.entrypoints.client_request_state.ClientRequestState][]
- [vllm_omni.entrypoints.log_utils.OrchestratorMetrics][]
- [vllm_omni.entrypoints.log_utils.StageRequestMetrics][]
- [vllm_omni.entrypoints.log_utils.StageStats][]
- [vllm_omni.entrypoints.omni.Omni][]
- [vllm_omni.entrypoints.omni.OmniBase][]
- [vllm_omni.entrypoints.omni_diffusion.OmniDiffusion][]
- [vllm_omni.entrypoints.omni_llm.OmniLLM][]
- [vllm_omni.entrypoints.omni_stage.OmniStage][]
- [vllm_omni.entrypoints.stage_utils.OmniStageTaskType][]
## Inputs
Input data structures for multi-modal inputs.
- [vllm_omni.inputs.data.OmniEmbedsPrompt][]
- [vllm_omni.inputs.data.OmniTokenInputs][]
- [vllm_omni.inputs.data.OmniTokensPrompt][]
- [vllm_omni.inputs.parse.parse_singleton_prompt_omni][]
- [vllm_omni.inputs.preprocess.OmniInputPreprocessor][]
## Engine
Engine classes for offline and online inference.
- [vllm_omni.diffusion.diffusion_engine.DiffusionEngine][]
- [vllm_omni.engine.AdditionalInformationEntry][]
- [vllm_omni.engine.AdditionalInformationPayload][]
- [vllm_omni.engine.OmniEngineCoreOutput][]
- [vllm_omni.engine.OmniEngineCoreOutputs][]
- [vllm_omni.engine.OmniEngineCoreRequest][]
- [vllm_omni.engine.PromptEmbedsPayload][]
- [vllm_omni.engine.arg_utils.AsyncOmniEngineArgs][]
- [vllm_omni.engine.arg_utils.OmniEngineArgs][]
- [vllm_omni.engine.input_processor.OmniInputProcessor][]
- [vllm_omni.engine.output_processor.MultimodalOutputProcessor][]
- [vllm_omni.engine.output_processor.OmniRequestState][]
## Core
Core scheduling and caching components.
- [vllm_omni.core.sched.omni_ar_scheduler.KVCacheTransferData][]
- [vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler][]
- [vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler][]
- [vllm_omni.core.sched.output.OmniCachedRequestData][]
- [vllm_omni.core.sched.output.OmniNewRequestData][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.DistributedGroupResidualVectorQuantization][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.DistributedResidualVectorQuantization][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.EuclideanCodebook][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.VectorQuantization][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.preprocess][]
## Configuration
Configuration classes.
- [vllm_omni.config.model.OmniModelConfig][]
- [vllm_omni.diffusion.cache.teacache.config.TeaCacheConfig][]
- [vllm_omni.distributed.omni_connectors.utils.config.ConnectorSpec][]
- [vllm_omni.distributed.omni_connectors.utils.config.OmniTransferConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSSpeakerEncoderConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSTalkerCodePredictorConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSTalkerConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2.Qwen3TTSTokenizerV2Config][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2.Qwen3TTSTokenizerV2DecoderConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1Config][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderBigVGANConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderDiTConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1EncoderConfig][]
## Workers
Worker classes and model runners for distributed inference.
- [vllm_omni.diffusion.worker.gpu_diffusion_model_runner.GPUDiffusionModelRunner][]
- [vllm_omni.diffusion.worker.gpu_diffusion_worker.GPUDiffusionWorker][]
- [vllm_omni.diffusion.worker.gpu_diffusion_worker.WorkerProc][]
- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorker][]
- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorkerProc][]
- [vllm_omni.worker.gpu_ar_model_runner.ExecuteModelState][]
- [vllm_omni.worker.gpu_ar_model_runner.GPUARModelRunner][]
- [vllm_omni.worker.gpu_ar_worker.GPUARWorker][]
- [vllm_omni.worker.gpu_generation_model_runner.GPUGenerationModelRunner][]
- [vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker][]
- [vllm_omni.worker.gpu_model_runner.OmniGPUModelRunner][]
- [vllm_omni.worker.npu.npu_ar_model_runner.ExecuteModelState][]
- [vllm_omni.worker.npu.npu_ar_model_runner.NPUARModelRunner][]
- [vllm_omni.worker.npu.npu_ar_worker.NPUARWorker][]
- [vllm_omni.worker.npu.npu_generation_model_runner.NPUGenerationModelRunner][]
- [vllm_omni.worker.npu.npu_generation_worker.NPUGenerationWorker][]
- [vllm_omni.worker.npu.npu_model_runner.OmniNPUModelRunner][]
# vLLM-Omni CLI Guide
The vLLM-Omni CLI inherits from vLLM and adds a few extra arguments.
## serve
Starts the vLLM-Omni OpenAI Compatible API server.
Start with a model:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni
```
Specify the port:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091
```
If you have a custom stage configs file, launch the server with the command below:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --stage-configs-path /path/to/stage_configs_file
```
## bench
Run benchmark tests for online serving throughput.
Example command:
```bash
vllm bench serve --omni \
--model Qwen/Qwen2.5-Omni-7B \
--host server-host \
--port server-port \
--random-input-len 32 \
--random-output-len 4 \
--num-prompts 5
```
See [vllm bench serve](./bench/serve.md) for the full reference of all available arguments.
# Contact Us
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm-omni/issues)
- For coordinating contributions and development, and for discussions with other users and developers, please join the `sig-omni` channel in our [Slack](https://slack.vllm.ai/) or use the [vLLM Forum](https://discuss.vllm.ai/)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm-omni/security/advisories) feature
# Volunteers for Bugfix and CI
We encourage you to check the current docs and [issues](https://github.com/vllm-project/vllm-omni/issues) for possible solutions to your questions. If none of these resolve your problem, please open an issue describing the bug or CI failure.
If you urgently need help locating and fixing a bug or CI problem, please reach out to the community volunteers listed below.
| Dec 4-Dec 12 | Dec 15-Dec 19 | Dec 22-Dec 26 | Dec 29-Jan 2, 2026 | Jan 5-Jan 9 | Jan 12-Jan 16 |
|----------|----------|----------|----------|----------|----------|
| <a href="https://github.com/congw729">Conw729</a> | <a href="https://github.com/yinpeiqi">yinpeiqi</a> | <a href="https://github.com/tzhouam">tzhouam</a> | <a href="https://github.com/SamitHuang">SamitHuang</a> | <a href="https://github.com/gcanlin">gcanlin</a> | <a href="https://github.com/natureofnature">natureofnature</a> |
| <a href="https://github.com/david6666666">david6666666</a> | <a href="https://github.com/R2-Y">R2-Y</a> | <a href="https://github.com/hsliuustc0106">hsliuustc0106</a> | <a href="https://github.com/Gaohan123">Gaohan123</a> | <a href="https://github.com/ZJY0516">ZJY0516</a> | <a href="https://github.com/qibaoyuan">qibaoyuan</a> |
We kindly welcome more contributors to fix bugs and contribute new features!
# Configuration Options
This section lists the most common options for running vLLM-Omni.
For options within a vLLM engine, please refer to the [vLLM Configuration](https://docs.vllm.ai/en/v0.14.0/configuration/index.html) documentation.
Currently, the main options are maintained in per-model stage configs.
For a concrete example, see the [Qwen2.5-Omni stage config](stage_configs/qwen2_5_omni.yaml).
For an introduction, see the [Introduction to stage configs](./stage_configs.md).
## Memory Configuration
- **[GPU Memory Calculation and Configuration](./gpu_memory_utilization.md)** - Guide on how to calculate memory requirements and set up `gpu_memory_utilization` for optimal performance
## Optimization Features
- **[TeaCache Configuration](../user_guide/diffusion/teacache.md)** - Enable TeaCache adaptive caching for DiT models to achieve 1.5x-2.0x speedup with minimal quality loss
- **[Cache-DiT Configuration](../user_guide/diffusion/cache_dit_acceleration.md)** - Enable Cache-DiT as cache acceleration backends for DiT models
- **[Parallelism Configuration](../user_guide/diffusion/parallelism_acceleration.md)** - Enable parallelism (e.g., sequence parallelism) for DiT models
# GPU Memory Calculation and Configuration
This guide explains how to calculate GPU memory requirements and properly configure `gpu_memory_utilization` for vLLM-Omni stages.
## Overview
`gpu_memory_utilization` is a critical parameter that controls how much GPU memory each stage can use. It's specified as a fraction between 0.0 and 1.0, where:
- `0.8` means 80% of the GPU's total memory
- `1.0` means 100% of the GPU's total memory (not recommended, leaves no buffer)
## How Memory is Calculated
### Memory Allocation Formula
For each stage, vLLM-Omni calculates the requested memory as:
```
requested_memory = total_gpu_memory × gpu_memory_utilization
```
The system checks that:
```
free_memory ≥ requested_memory
```
If this condition is not met, the stage will fail to initialize with an error message showing the memory requirements.
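A minimal sketch of this check, assuming PyTorch is available (the function name and error message are illustrative, not the actual vLLM-Omni internals):

```python
import torch

def check_gpu_memory(device: int, gpu_memory_utilization: float) -> None:
    """Rough sketch of the free-memory check described above."""
    # mem_get_info returns (free_bytes, total_bytes) for the given device
    free_bytes, total_bytes = torch.cuda.mem_get_info(device)
    requested_bytes = total_bytes * gpu_memory_utilization
    if free_bytes < requested_bytes:
        raise RuntimeError(
            f"Free memory ({free_bytes / 1e9:.1f} GB) is less than the requested "
            f"{requested_bytes / 1e9:.1f} GB ({gpu_memory_utilization:.0%} of "
            f"{total_bytes / 1e9:.1f} GB total) on GPU {device}."
        )

# Example: require 80% of GPU 0 before initializing a stage
# check_gpu_memory(device=0, gpu_memory_utilization=0.8)
```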
### Memory Components
The total memory used by a stage includes:
1. **Model Weights**: The size of the model parameters loaded on the GPU
2. **KV Cache**: Memory for storing key-value cache during generation
3. **Activation Memory**: Temporary memory for intermediate computations
4. **System Overhead**: Memory used by CUDA, PyTorch, and other system components
5. **Non-Torch Memory**: Memory allocated outside of PyTorch (e.g., CUDA graphs)
### Example Calculation
For a GPU with 80GB total memory:
- `gpu_memory_utilization: 0.8` → 64GB available for the stage
- `gpu_memory_utilization: 0.6` → 48GB available for the stage
- `gpu_memory_utilization: 0.15` → 12GB available for the stage
## Setting Up `gpu_memory_utilization`
### Step 1: Determine GPU Memory
First, check your GPU's total memory:
```bash
# Using nvidia-smi
nvidia-smi --query-gpu=memory.total --format=csv
# Or using Python
python -c "import torch; print(f'{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')"
```
### Step 2: Estimate Model Memory Requirements
#### For Autoregressive (AR) Stages
AR stages typically need more memory due to:
- Large model weights
- KV cache for attention
- Activation buffers
#### For Diffusion/Generation Stages
Diffusion stages (like code2wav) typically need less memory:
- Smaller model components
- Different memory access patterns
**Typical values:**
- `0.1 - 0.3` for most diffusion stages
### Step 3: Consider Multi-Stage Scenarios
When multiple stages share the same GPU, you must ensure the sum of their `gpu_memory_utilization` values doesn't exceed 1.0.
**Example: Two stages on GPU 0**
```yaml
stage_args:
- stage_id: 0
runtime:
devices: "0"
engine_args:
gpu_memory_utilization: 0.6 # Uses 60% of GPU 0
- stage_id: 1
runtime:
devices: "0"
engine_args:
gpu_memory_utilization: 0.3 # Uses 30% of GPU 0
# Total: 90% of GPU 0 (safe, leaves 10% buffer)
```
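Before launching, you can sanity-check such a config with a small helper like the one below (an illustrative sketch, not part of vLLM-Omni; the dict layout simply mirrors the YAML above):

```python
from collections import defaultdict

def check_stage_memory_budget(stage_args: list[dict]) -> None:
    """Warn if stages sharing a GPU request more than 100% of its memory in total."""
    per_device = defaultdict(float)
    for stage in stage_args:
        util = stage["engine_args"]["gpu_memory_utilization"]
        for device in stage["runtime"]["devices"].split(","):
            per_device[device.strip()] += util
    for device, total in sorted(per_device.items()):
        status = "over-committed!" if total > 1.0 else "ok"
        print(f"GPU {device}: {total:.0%} requested ({status})")

# The two stages from the YAML example above, both on GPU 0:
check_stage_memory_budget([
    {"runtime": {"devices": "0"}, "engine_args": {"gpu_memory_utilization": 0.6}},
    {"runtime": {"devices": "0"}, "engine_args": {"gpu_memory_utilization": 0.3}},
])
# GPU 0: 90% requested (ok)
```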
**Important:** If stages run on different GPUs, each can use up to 1.0 independently.
### Step 4: Account for Tensor Parallelism
When using `tensor_parallel_size > 1`, the model is split across multiple GPUs, so each GPU needs less memory.
**Example: 2-way tensor parallelism**
```yaml
stage_args:
- stage_id: 0
runtime:
devices: "0,1" # Uses both GPUs
engine_args:
tensor_parallel_size: 2
gpu_memory_utilization: 0.6 # 60% per GPU
# Model is split, so each GPU uses ~30% of model memory
```
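As a rough back-of-the-envelope estimate (illustrative numbers, assuming the weights are sharded evenly), the per-GPU weight footprint under tensor parallelism is approximately the total weight memory divided by `tensor_parallel_size`:

```python
def per_gpu_weight_memory_gb(num_params: float, dtype_bytes: int, tp_size: int) -> float:
    """Approximate per-GPU weight memory when weights are sharded across tp_size GPUs."""
    return num_params * dtype_bytes / tp_size / 1e9

# Hypothetical 30B-parameter model in BF16 (2 bytes per parameter) with TP=2: ~30 GB per GPU
print(f"{per_gpu_weight_memory_gb(30e9, 2, 2):.0f} GB per GPU")
```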
## Examples
### Qwen3-Omni-MoE on 2x H100-80GB
```yaml
stage_args:
- stage_id: 0 # Thinker stage with TP=2
runtime:
devices: "0,1"
engine_args:
tensor_parallel_size: 2
gpu_memory_utilization: 0.6 # 48GB per GPU
- stage_id: 1 # Talker stage
runtime:
devices: "1"
engine_args:
gpu_memory_utilization: 0.3 # 24GB on GPU 1
- stage_id: 2 # Code2Wav stage
runtime:
devices: "0"
engine_args:
gpu_memory_utilization: 0.1 # 8GB on GPU 0
```
**Note:** In this configuration, stages 0 and 2 share GPU 0, but they run at different times in the pipeline, so their memory usage doesn't overlap.
## Troubleshooting
### Error: "Free memory is less than desired GPU memory utilization"
This means the GPU doesn't have enough free memory when the stage starts.
**Solutions:**
1. Free up memory by closing other processes
2. Reduce `gpu_memory_utilization` for this stage
3. Use a GPU with more memory
4. Move the stage to a different GPU
### Error: OOM during inference
The stage initialized but ran out of memory during processing.
**Solutions:**
1. Reduce `max_num_batched_tokens`
2. Reduce `max_batch_size` in runtime config
3. Lower `gpu_memory_utilization` slightly
4. Enable quantization if supported
### Memory Not Fully Utilized
If you see low memory usage, you can:
1. Increase `gpu_memory_utilization` to allow larger KV cache
2. Increase `max_num_batched_tokens` for better batching
3. Check if other stages are limiting throughput
## Useful Formulas for Memory Calculation
### KV Cache Memory
The KV cache size depends on:
- Number of sequences in batch
- Sequence length (prompt + generation)
- Model hidden size
- Number of attention heads
- Number of layers
Approximate formula:
```
kv_cache_memory ≈ batch_size × seq_len × hidden_size × num_layers × 2 × dtype_size
```
The factor of 2 accounts for the key and value tensors.
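As a worked example (illustrative numbers only, assuming full-precision KV cache without grouped-query attention), the formula can be evaluated directly:

```python
def kv_cache_memory_gb(batch_size: int, seq_len: int, hidden_size: int,
                       num_layers: int, dtype_bytes: int = 2) -> float:
    """Approximate KV cache size in GB; the factor of 2 covers keys and values."""
    return batch_size * seq_len * hidden_size * num_layers * 2 * dtype_bytes / 1e9

# Hypothetical 7B-class model: hidden_size=4096, 32 layers, batch of 8, 4096-token sequences, BF16
print(f"{kv_cache_memory_gb(8, 4096, 4096, 32, 2):.1f} GB")  # ~17.2 GB
```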
### Model Weight Memory
```
model_memory ≈ num_parameters × dtype_size
```
For example:
- 7B parameters in FP16: ~14GB
- 7B parameters in FP32: ~28GB
- 7B parameters in INT8: ~7GB
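These numbers follow directly from the formula; a tiny sketch that reproduces them (the parameter count and dtype sizes are the illustrative values above):

```python
def model_weight_memory_gb(num_parameters: float, dtype_size_bytes: int) -> float:
    """Approximate model weight memory in GB: num_parameters × dtype_size."""
    return num_parameters * dtype_size_bytes / 1e9

for dtype, size in [("FP32", 4), ("FP16", 2), ("INT8", 1)]:
    print(f"7B parameters in {dtype}: ~{model_weight_memory_gb(7e9, size):.0f} GB")
```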
### Activation Memory
Activation memory is typically smaller but varies with:
- Batch size
- Sequence length
- Model architecture
It's usually 10-30% of model weight memory during inference.