Commit c1cacde6 authored by weishb

vllm-omni_0.15.0.rc1+fix1 first commit

parent 35607782
import argparse
import json
import os
import soundfile as sf
from qwen3_omni_moe_model import Qwen3OmniMoeForConditionalGenerationWithLogging
from qwen_omni_utils import process_mm_info
from tqdm import tqdm
from transformers import Qwen3OmniMoeProcessor
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
# MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Thinking"
def load_prompts(prompts_file: str) -> list[str]:
"""Load prompts from a text file, one prompt per line."""
prompts = []
with open(prompts_file, encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
prompts.append(line)
return prompts
def run_benchmark(
model,
processor,
prompts: list[str],
output_dir: str = "benchmark_results",
speaker: str = "Ethan",
use_audio_in_video: bool = True,
):
"""
Run benchmark on a list of prompts and collect performance stats.
Args:
model: The Qwen3OmniMoe model
processor: The Qwen3OmniMoe processor
prompts: List of text prompts to process
output_dir: Directory to save results
speaker: Speaker voice for audio output
use_audio_in_video: Whether to use audio in video
Returns:
tuple: (aggregated_stats, results, audio_outputs)
- aggregated_stats: dict with aggregated performance statistics
- results: list of dicts with per-prompt results
- audio_outputs: list of audio tensors/arrays (or None if no audio)
"""
os.makedirs(output_dir, exist_ok=True)
audio_dir = os.path.join(output_dir, "audio")
os.makedirs(audio_dir, exist_ok=True)
all_stats = []
results = []
audio_outputs = []
for idx, prompt in enumerate(tqdm(prompts, desc="Processing prompts")):
conversation = [
{
"role": "user",
"content": [{"type": "text", "text": prompt}],
},
]
# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=use_audio_in_video)
inputs = processor(
text=text,
audio=audios,
images=images,
videos=videos,
return_tensors="pt",
padding=True,
use_audio_in_video=use_audio_in_video,
)
inputs = inputs.to(model.device).to(model.dtype)
# Inference: Generation of the output text and audio
text_ids, audio = model.generate(
**inputs, speaker=speaker, thinker_return_dict_in_generate=True, use_audio_in_video=use_audio_in_video
)
# Decode output text
output_text = processor.batch_decode(
text_ids.sequences[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)[0]
# Collect performance stats
perf_stats = None
if hasattr(model, "_perf_stats_last"):
perf_stats = model._perf_stats_last.copy()
perf_stats["prompt_idx"] = idx
perf_stats["prompt"] = prompt
all_stats.append(perf_stats)
# Save audio and collect audio output
audio_path = None
audio_data = None
if audio is not None:
audio_data = audio.reshape(-1).detach().cpu().numpy()
audio_path = os.path.join(audio_dir, f"output_{idx:04d}.wav")
sf.write(
audio_path,
audio_data,
samplerate=24000,
)
audio_outputs.append(audio_data)
else:
audio_outputs.append(None)
# Save result
result = {
"idx": idx,
"prompt": prompt,
"output": output_text,
"audio_path": audio_path,
"perf_stats": perf_stats,
}
results.append(result)
# Aggregate statistics
aggregated_stats = aggregate_stats(all_stats)
# Save all results
results_path = os.path.join(output_dir, "results.json")
with open(results_path, "w", encoding="utf-8") as f:
json.dump(results, f, ensure_ascii=False, indent=2)
# Save aggregated stats
stats_path = os.path.join(output_dir, "perf_stats.json")
with open(stats_path, "w", encoding="utf-8") as f:
json.dump({"aggregated": aggregated_stats, "per_prompt": all_stats}, f, ensure_ascii=False, indent=2)
# Count saved audio files
num_audio_saved = sum(1 for a in audio_outputs if a is not None)
print(f"\nSaved {num_audio_saved} audio files to {audio_dir}/")
return aggregated_stats, results, audio_outputs
def aggregate_stats(all_stats: list[dict]) -> dict:
"""Aggregate performance statistics from multiple runs."""
if not all_stats:
return {}
keys = [
"thinker_tokens",
"thinker_time_s",
"thinker_tps",
"talker_tokens",
"talker_time_s",
"talker_tps",
"code2wav_tokens",
"code2wav_time_s",
"code2wav_tps",
"total_tokens",
"total_time_s",
"total_tps",
]
aggregated = {
"num_samples": len(all_stats),
}
for key in keys:
values = [s.get(key, 0) for s in all_stats if key in s]
if values:
aggregated[f"{key}_sum"] = sum(values)
aggregated[f"{key}_avg"] = sum(values) / len(values)
aggregated[f"{key}_min"] = min(values)
aggregated[f"{key}_max"] = max(values)
# Calculate overall throughput
total_tokens = aggregated.get("total_tokens_sum", 0)
total_time = aggregated.get("total_time_s_sum", 0)
if total_time > 0:
aggregated["overall_tps"] = total_tokens / total_time
return aggregated
def print_stats(stats: dict):
"""Print performance statistics in a formatted way."""
print("\n" + "=" * 60)
print("Performance Statistics Summary")
print("=" * 60)
print(f"\nNumber of samples: {stats.get('num_samples', 0)}")
print("\n--- Thinker ---")
print(f" Total tokens: {stats.get('thinker_tokens_sum', 0):.0f}")
print(f" Total time: {stats.get('thinker_time_s_sum', 0):.2f}s")
print(f" Avg TPS: {stats.get('thinker_tps_avg', 0):.2f}")
print(f" Min TPS: {stats.get('thinker_tps_min', 0):.2f}")
print(f" Max TPS: {stats.get('thinker_tps_max', 0):.2f}")
print("\n--- Talker ---")
print(f" Total tokens: {stats.get('talker_tokens_sum', 0):.0f}")
print(f" Total time: {stats.get('talker_time_s_sum', 0):.2f}s")
print(f" Avg TPS: {stats.get('talker_tps_avg', 0):.2f}")
print(f" Min TPS: {stats.get('talker_tps_min', 0):.2f}")
print(f" Max TPS: {stats.get('talker_tps_max', 0):.2f}")
print("\n--- Code2Wav ---")
print(f" Total tokens: {stats.get('code2wav_tokens_sum', 0):.0f}")
print(f" Total time: {stats.get('code2wav_time_s_sum', 0):.2f}s")
print(f" Avg TPS: {stats.get('code2wav_tps_avg', 0):.2f}")
print(f" Min TPS: {stats.get('code2wav_tps_min', 0):.2f}")
print(f" Max TPS: {stats.get('code2wav_tps_max', 0):.2f}")
print("\n--- Overall ---")
print(f" Total tokens: {stats.get('total_tokens_sum', 0):.0f}")
print(f" Total time: {stats.get('total_time_s_sum', 0):.2f}s")
print(f" Overall TPS: {stats.get('overall_tps', 0):.2f}")
print(f" Avg TPS: {stats.get('total_tps_avg', 0):.2f}")
print(f" Min TPS: {stats.get('total_tps_min', 0):.2f}")
print(f" Max TPS: {stats.get('total_tps_max', 0):.2f}")
print("=" * 60 + "\n")
def main():
parser = argparse.ArgumentParser(description="Qwen3-Omni Benchmark Script")
parser.add_argument(
"--prompts_file",
type=str,
default="benchmark/build_dataset/top100.txt",
help="Path to the prompts file (one prompt per line)",
)
parser.add_argument(
"--output_dir", type=str, default="benchmark_results", help="Directory to save benchmark results"
)
parser.add_argument("--model_path", type=str, default=MODEL_PATH, help="Path to the model")
parser.add_argument("--speaker", type=str, default="Ethan", help="Speaker voice for audio output")
parser.add_argument("--num_prompts", type=int, default=None, help="Number of prompts to process (default: all)")
args = parser.parse_args()
# Load model and processor
print(f"Loading model from {args.model_path}...")
model = Qwen3OmniMoeForConditionalGenerationWithLogging.from_pretrained(
args.model_path,
dtype="auto",
device_map="auto",
attn_implementation="flash_attention_2",
)
processor = Qwen3OmniMoeProcessor.from_pretrained(args.model_path)
# Benchmark mode
print(f"Loading prompts from {args.prompts_file}...")
prompts = load_prompts(args.prompts_file)
if args.num_prompts:
prompts = prompts[: args.num_prompts]
print(f"Running benchmark on {len(prompts)} prompts...")
aggregated_stats, results, audio_outputs = run_benchmark(
model=model,
processor=processor,
prompts=prompts,
output_dir=args.output_dir,
speaker=args.speaker,
)
print_stats(aggregated_stats)
print(f"\nResults saved to {args.output_dir}/")
if __name__ == "__main__":
main()
#!/bin/bash
# Qwen3-Omni Benchmark Evaluation Script
# This script must be run from the vllm-omni root directory
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Navigate to the vllm-omni root directory (three levels up from the script location)
VLLM_OMNI_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
cd "$VLLM_OMNI_ROOT" || { echo "Error: Failed to navigate to vllm-omni directory"; exit 1; }
echo "Working directory: $(pwd)"
# Verify we're in the correct directory and run benchmark
if [[ ! -d "benchmarks/qwen3-omni/vllm_omni" ]]; then
echo "Error: Not in vllm-omni root directory. Please run from vllm-omni folder."
else
log_dir=benchmarks/qwen3-omni/vllm_omni/logs
outputs_dir=benchmarks/qwen3-omni/vllm_omni/outputs
end2end_script_path=examples/offline_inference/qwen3_omni/end2end.py
build_dataset_path=benchmarks/build_dataset/top100.txt
python $end2end_script_path --output-wav $outputs_dir \
--query-type text \
--txt-prompts $build_dataset_path \
--enable-stats \
--log-dir $log_dir
echo "Logs and outputs are saved in ${log_dir} and ${outputs_dir} respectively:"
echo " - omni_llm_pipeline_text run dir/base name"
echo " - omni_llm_pipeline_text.orchestrator.stats.jsonl orchestrator-stage latency stats"
echo " - omni_llm_pipeline_text.overall.stats.jsonl overall latency/TPS stats"
echo " - omni_llm_pipeline_text.stage0.log per-stage detailed logs"
echo " - omni_llm_pipeline_text.stage1.log"
echo " - omni_llm_pipeline_text.stage2.log"
echo "Key checks: overall.stats.jsonl for end-to-end latency/TPS; orchestrator.stats.jsonl for stable per-stage latency; stage*.log for errors or long tails."
echo " - outputs/ Generated txt and wav files; 100 text files and 100 wav files should be generated"
fi
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
import datetime
import locale
import os
import subprocess
import sys
# Unlike the rest of PyTorch, this file must be Python 2 compliant.
# This script outputs relevant system environment info
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
from collections import namedtuple
import regex as re
from vllm.envs import environment_variables
try:
import torch
TORCH_AVAILABLE = True
except (ImportError, NameError, AttributeError, OSError):
TORCH_AVAILABLE = False
# System Environment Information
SystemEnv = namedtuple(
"SystemEnv",
[
"torch_version",
"is_debug_build",
"cuda_compiled_version",
"gcc_version",
"clang_version",
"cmake_version",
"os",
"libc_version",
"python_version",
"python_platform",
"is_cuda_available",
"cuda_runtime_version",
"cuda_module_loading",
"nvidia_driver_version",
"nvidia_gpu_models",
"cudnn_version",
"pip_version", # 'pip' or 'pip3'
"pip_packages",
"conda_packages",
"hip_compiled_version",
"hip_runtime_version",
"miopen_runtime_version",
"caching_allocator_config",
"is_xnnpack_available",
"cpu_info",
"rocm_version", # vllm specific field
"vllm_version", # vllm specific field
"vllm_omni_version", # vllm-omni specific field
"vllm_build_flags", # vllm specific field
"gpu_topo", # vllm specific field
"env_vars",
],
)
DEFAULT_CONDA_PATTERNS = {
"torch",
"numpy",
"cudatoolkit",
"soumith",
"mkl",
"magma",
"triton",
"optree",
"nccl",
"transformers",
"zmq",
"nvidia",
"pynvml",
"flashinfer-python",
}
DEFAULT_PIP_PATTERNS = {
"torch",
"numpy",
"mypy",
"flake8",
"triton",
"optree",
"onnx",
"nccl",
"transformers",
"zmq",
"nvidia",
"pynvml",
"flashinfer-python",
}
def run(command):
"""Return (return-code, stdout, stderr)."""
shell = isinstance(command, str)
try:
p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell)
raw_output, raw_err = p.communicate()
rc = p.returncode
if get_platform() == "win32":
enc = "oem"
else:
enc = locale.getpreferredencoding()
output = raw_output.decode(enc)
if command == "nvidia-smi topo -m":
# don't remove the leading whitespace of `nvidia-smi topo -m`
# because they are meaningful
output = output.rstrip()
else:
output = output.strip()
err = raw_err.decode(enc)
return rc, output, err.strip()
except FileNotFoundError:
cmd_str = command if isinstance(command, str) else command[0]
return 127, "", f"Command not found: {cmd_str}"
def run_and_read_all(run_lambda, command):
"""Run command using run_lambda; reads and returns entire output if rc is 0."""
rc, out, _ = run_lambda(command)
if rc != 0:
return None
return out
def run_and_parse_first_match(run_lambda, command, regex):
"""Run command using run_lambda, returns the first regex match if it exists."""
rc, out, _ = run_lambda(command)
if rc != 0:
return None
match = re.search(regex, out)
if match is None:
return None
return match.group(1)
def run_and_return_first_line(run_lambda, command):
"""Run command using run_lambda and returns first line if output is not empty."""
rc, out, _ = run_lambda(command)
if rc != 0:
return None
return out.split("\n")[0]
def get_conda_packages(run_lambda, patterns=None):
if patterns is None:
patterns = DEFAULT_CONDA_PATTERNS
conda = os.environ.get("CONDA_EXE", "conda")
out = run_and_read_all(run_lambda, [conda, "list"])
if out is None:
return out
return "\n".join(
line for line in out.splitlines() if not line.startswith("#") and any(name in line for name in patterns)
)
def get_gcc_version(run_lambda):
return run_and_parse_first_match(run_lambda, "gcc --version", r"gcc (.*)")
def get_clang_version(run_lambda):
return run_and_parse_first_match(run_lambda, "clang --version", r"clang version (.*)")
def get_cmake_version(run_lambda):
return run_and_parse_first_match(run_lambda, "cmake --version", r"cmake (.*)")
def get_nvidia_driver_version(run_lambda):
if get_platform() == "darwin":
cmd = "kextstat | grep -i cuda"
return run_and_parse_first_match(run_lambda, cmd, r"com[.]nvidia[.]CUDA [(](.*?)[)]")
smi = get_nvidia_smi()
return run_and_parse_first_match(run_lambda, smi, r"Driver Version: (.*?) ")
def get_gpu_info(run_lambda):
if get_platform() == "darwin" or (
TORCH_AVAILABLE and hasattr(torch.version, "hip") and torch.version.hip is not None
):
if TORCH_AVAILABLE and torch.cuda.is_available():
if torch.version.hip is not None:
prop = torch.cuda.get_device_properties(0)
if hasattr(prop, "gcnArchName"):
gcnArch = " ({})".format(prop.gcnArchName)
else:
gcnArch = "NoGCNArchNameOnOldPyTorch"
else:
gcnArch = ""
return torch.cuda.get_device_name(None) + gcnArch
return None
smi = get_nvidia_smi()
uuid_regex = re.compile(r" \(UUID: .+?\)")
rc, out, _ = run_lambda(smi + " -L")
if rc != 0:
return None
# Anonymize GPUs by removing their UUID
return re.sub(uuid_regex, "", out)
def get_running_cuda_version(run_lambda):
return run_and_parse_first_match(run_lambda, "nvcc --version", r"release .+ V(.*)")
def get_cudnn_version(run_lambda):
"""Return a list of libcudnn.so; it's hard to tell which one is being used."""
if get_platform() == "win32":
system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
cuda_path = os.environ.get("CUDA_PATH", "%CUDA_PATH%")
where_cmd = os.path.join(system_root, "System32", "where")
cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
elif get_platform() == "darwin":
# CUDA libraries and drivers can be found in /usr/local/cuda/. See
# https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
# https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
# Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*"
else:
cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'
rc, out, _ = run_lambda(cudnn_cmd)
# find will return 1 if there are permission errors or if not found
if len(out) == 0 or (rc != 1 and rc != 0):
l = os.environ.get("CUDNN_LIBRARY")
if l is not None and os.path.isfile(l):
return os.path.realpath(l)
return None
files_set = set()
for fn in out.split("\n"):
fn = os.path.realpath(fn) # eliminate symbolic links
if os.path.isfile(fn):
files_set.add(fn)
if not files_set:
return None
# Alphabetize the result because the order is non-deterministic otherwise
files = sorted(files_set)
if len(files) == 1:
return files[0]
result = "\n".join(files)
return "Probably one of the following:\n{}".format(result)
def get_nvidia_smi():
# Note: nvidia-smi is currently available only on Windows and Linux
smi = "nvidia-smi"
if get_platform() == "win32":
system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
program_files_root = os.environ.get("PROGRAMFILES", "C:\\Program Files")
legacy_path = os.path.join(program_files_root, "NVIDIA Corporation", "NVSMI", smi)
new_path = os.path.join(system_root, "System32", smi)
smis = [new_path, legacy_path]
for candidate_smi in smis:
if os.path.exists(candidate_smi):
smi = '"{}"'.format(candidate_smi)
break
return smi
def get_rocm_version(run_lambda):
"""Returns the ROCm version if available, otherwise 'N/A'."""
return run_and_parse_first_match(run_lambda, "hipcc --version", r"HIP version: (\S+)")
def get_vllm_version():
from vllm import __version__, __version_tuple__
if __version__ == "dev":
return "N/A (dev)"
version_str = __version_tuple__[-1]
if isinstance(version_str, str) and version_str.startswith("g"):
# it's a dev build
if "." in version_str:
# it's a dev build containing local changes
git_sha = version_str.split(".")[0][1:]
date = version_str.split(".")[-1][1:]
return f"{__version__} (git sha: {git_sha}, date: {date})"
else:
# it's a dev build without local changes
git_sha = version_str[1:] # type: ignore
return f"{__version__} (git sha: {git_sha})"
return __version__
def get_vllm_omni_version(run_lambda):
try:
import vllm_omni
from vllm_omni import __version__, __version_tuple__
version_str = __version_tuple__[-1]
if isinstance(version_str, str) and version_str.startswith("g"):
if "." in version_str:
git_sha = version_str.split(".")[0][1:]
date = version_str.split(".")[-1][1:]
return f"{__version__} (git sha: {git_sha}, date: {date})"
else:
git_sha = version_str[1:]
return f"{__version__} (git sha: {git_sha})"
package_dir = os.path.dirname(os.path.abspath(vllm_omni.__file__))
git_sha = run_and_read_all(run_lambda, f"git -C {package_dir} rev-parse --short HEAD")
if git_sha:
return f"{__version__} (git sha: {git_sha})"
return __version__
except ImportError:
return "N/A (vllm_omni not installed)"
def summarize_vllm_build_flags():
# This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
return "CUDA Archs: {}; ROCm: {}".format(
os.environ.get("TORCH_CUDA_ARCH_LIST", "Not Set"),
"Enabled" if os.environ.get("ROCM_HOME") else "Disabled",
)
def get_gpu_topo(run_lambda):
output = None
if get_platform() == "linux":
output = run_and_read_all(run_lambda, "nvidia-smi topo -m")
if output is None:
output = run_and_read_all(run_lambda, "rocm-smi --showtopo")
return output
def get_cpu_info(run_lambda):
rc, out, err = 0, "", ""
if get_platform() == "linux":
rc, out, err = run_lambda("lscpu")
elif get_platform() == "win32":
rc, out, err = run_lambda(
"wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE"
)
elif get_platform() == "darwin":
rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
cpu_info = "None"
if rc == 0:
cpu_info = out
else:
cpu_info = err
return cpu_info
def get_platform():
if sys.platform.startswith("linux"):
return "linux"
elif sys.platform.startswith("win32"):
return "win32"
elif sys.platform.startswith("cygwin"):
return "cygwin"
elif sys.platform.startswith("darwin"):
return "darwin"
else:
return sys.platform
def get_mac_version(run_lambda):
return run_and_parse_first_match(run_lambda, "sw_vers -productVersion", r"(.*)")
def get_windows_version(run_lambda):
system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
wmic_cmd = os.path.join(system_root, "System32", "Wbem", "wmic")
findstr_cmd = os.path.join(system_root, "System32", "findstr")
return run_and_read_all(run_lambda, "{} os get Caption | {} /v Caption".format(wmic_cmd, findstr_cmd))
def get_lsb_version(run_lambda):
return run_and_parse_first_match(run_lambda, "lsb_release -a", r"Description:\t(.*)")
def check_release_file(run_lambda):
return run_and_parse_first_match(run_lambda, "cat /etc/*-release", r'PRETTY_NAME="(.*)"')
def get_os(run_lambda):
from platform import machine
platform = get_platform()
if platform == "win32" or platform == "cygwin":
return get_windows_version(run_lambda)
if platform == "darwin":
version = get_mac_version(run_lambda)
if version is None:
return None
return "macOS {} ({})".format(version, machine())
if platform == "linux":
# Ubuntu/Debian based
desc = get_lsb_version(run_lambda)
if desc is not None:
return "{} ({})".format(desc, machine())
# Try reading /etc/*-release
desc = check_release_file(run_lambda)
if desc is not None:
return "{} ({})".format(desc, machine())
return "{} ({})".format(platform, machine())
# Unknown platform
return platform
def get_python_platform():
import platform
return platform.platform()
def get_libc_version():
import platform
if get_platform() != "linux":
return "N/A"
return "-".join(platform.libc_ver())
def is_uv_venv():
if os.environ.get("UV"):
return True
pyvenv_cfg_path = os.path.join(sys.prefix, "pyvenv.cfg")
if os.path.exists(pyvenv_cfg_path):
with open(pyvenv_cfg_path, "r") as f:
return any(line.startswith("uv = ") for line in f)
return False
def get_pip_packages(run_lambda, patterns=None):
"""Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
if patterns is None:
patterns = DEFAULT_PIP_PATTERNS
def run_with_pip():
try:
import importlib.util
pip_spec = importlib.util.find_spec("pip")
pip_available = pip_spec is not None
except ImportError:
pip_available = False
if pip_available:
cmd = [sys.executable, "-mpip", "list", "--format=freeze"]
elif is_uv_venv():
print("uv is set")
cmd = ["uv", "pip", "list", "--format=freeze"]
else:
raise RuntimeError("Could not collect pip list output (pip or uv module not available)")
out = run_and_read_all(run_lambda, cmd)
return "\n".join(line for line in out.splitlines() if any(name in line for name in patterns))
pip_version = "pip3" if sys.version[0] == "3" else "pip"
out = run_with_pip()
return pip_version, out
def get_cachingallocator_config():
ca_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
return ca_config
def get_cuda_module_loading_config():
if TORCH_AVAILABLE and torch.cuda.is_available():
torch.cuda.init()
config = os.environ.get("CUDA_MODULE_LOADING", "")
return config
else:
return "N/A"
def is_xnnpack_available():
if TORCH_AVAILABLE:
import torch.backends.xnnpack
return str(torch.backends.xnnpack.enabled) # type: ignore[attr-defined]
else:
return "N/A"
def get_env_vars():
env_vars = ""
secret_terms = ("secret", "token", "api", "access", "password")
report_prefix = (
"TORCH",
"NCCL",
"PYTORCH",
"CUDA",
"CUBLAS",
"CUDNN",
"OMP_",
"MKL_",
"NVIDIA",
)
for k, v in os.environ.items():
if any(term in k.lower() for term in secret_terms):
continue
if k in environment_variables:
env_vars = env_vars + "{}={}".format(k, v) + "\n"
if k.startswith(report_prefix):
env_vars = env_vars + "{}={}".format(k, v) + "\n"
return env_vars
def get_env_info():
run_lambda = run
pip_version, pip_list_output = get_pip_packages(run_lambda)
if TORCH_AVAILABLE:
version_str = torch.__version__
debug_mode_str = str(torch.version.debug)
cuda_available_str = str(torch.cuda.is_available())
cuda_version_str = torch.version.cuda
if not hasattr(torch.version, "hip") or torch.version.hip is None: # cuda version
hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"
else: # HIP version
def get_version_or_na(cfg, prefix):
_lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s]
return _lst[0] if _lst else "N/A"
cfg = torch._C._show_config().split("\n")
hip_runtime_version = get_version_or_na(cfg, "HIP Runtime")
miopen_runtime_version = get_version_or_na(cfg, "MIOpen")
cuda_version_str = "N/A"
hip_compiled_version = torch.version.hip
else:
version_str = debug_mode_str = cuda_available_str = cuda_version_str = "N/A"
hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"
sys_version = sys.version.replace("\n", " ")
conda_packages = get_conda_packages(run_lambda)
rocm_version = get_rocm_version(run_lambda)
vllm_version = get_vllm_version()
vllm_omni_version = get_vllm_omni_version(run_lambda)
vllm_build_flags = summarize_vllm_build_flags()
gpu_topo = get_gpu_topo(run_lambda)
return SystemEnv(
torch_version=version_str,
is_debug_build=debug_mode_str,
python_version="{} ({}-bit runtime)".format(sys_version, sys.maxsize.bit_length() + 1),
python_platform=get_python_platform(),
is_cuda_available=cuda_available_str,
cuda_compiled_version=cuda_version_str,
cuda_runtime_version=get_running_cuda_version(run_lambda),
cuda_module_loading=get_cuda_module_loading_config(),
nvidia_gpu_models=get_gpu_info(run_lambda),
nvidia_driver_version=get_nvidia_driver_version(run_lambda),
cudnn_version=get_cudnn_version(run_lambda),
hip_compiled_version=hip_compiled_version,
hip_runtime_version=hip_runtime_version,
miopen_runtime_version=miopen_runtime_version,
pip_version=pip_version,
pip_packages=pip_list_output,
conda_packages=conda_packages,
os=get_os(run_lambda),
libc_version=get_libc_version(),
gcc_version=get_gcc_version(run_lambda),
clang_version=get_clang_version(run_lambda),
cmake_version=get_cmake_version(run_lambda),
caching_allocator_config=get_cachingallocator_config(),
is_xnnpack_available=is_xnnpack_available(),
cpu_info=get_cpu_info(run_lambda),
rocm_version=rocm_version,
vllm_version=vllm_version,
vllm_omni_version=vllm_omni_version,
vllm_build_flags=vllm_build_flags,
gpu_topo=gpu_topo,
env_vars=get_env_vars(),
)
env_info_fmt = """
==============================
System Info
==============================
OS : {os}
GCC version : {gcc_version}
Clang version : {clang_version}
CMake version : {cmake_version}
Libc version : {libc_version}
==============================
PyTorch Info
==============================
PyTorch version : {torch_version}
Is debug build : {is_debug_build}
CUDA used to build PyTorch : {cuda_compiled_version}
ROCM used to build PyTorch : {hip_compiled_version}
==============================
Python Environment
==============================
Python version : {python_version}
Python platform : {python_platform}
==============================
CUDA / GPU Info
==============================
Is CUDA available : {is_cuda_available}
CUDA runtime version : {cuda_runtime_version}
CUDA_MODULE_LOADING set to : {cuda_module_loading}
GPU models and configuration : {nvidia_gpu_models}
Nvidia driver version : {nvidia_driver_version}
cuDNN version : {cudnn_version}
HIP runtime version : {hip_runtime_version}
MIOpen runtime version : {miopen_runtime_version}
Is XNNPACK available : {is_xnnpack_available}
==============================
CPU Info
==============================
{cpu_info}
==============================
Versions of relevant libraries
==============================
{pip_packages}
{conda_packages}
""".strip()
# both the above code and the following code use `strip()` to
# remove leading/trailing whitespaces, so we need to add a newline
# in between to separate the two sections
env_info_fmt += "\n\n"
env_info_fmt += """
==============================
vLLM Info
==============================
ROCM Version : {rocm_version}
vLLM Version : {vllm_version}
vLLM-Omni Version : {vllm_omni_version}
vLLM Build Flags:
{vllm_build_flags}
GPU Topology:
{gpu_topo}
==============================
Environment Variables
==============================
{env_vars}
""".strip()
def pretty_str(envinfo):
def replace_nones(dct, replacement="Could not collect"):
for key in dct.keys():
if dct[key] is not None:
continue
dct[key] = replacement
return dct
def replace_bools(dct, true="Yes", false="No"):
for key in dct.keys():
if dct[key] is True:
dct[key] = true
elif dct[key] is False:
dct[key] = false
return dct
def prepend(text, tag="[prepend]"):
lines = text.split("\n")
updated_lines = [tag + line for line in lines]
return "\n".join(updated_lines)
def replace_if_empty(text, replacement="No relevant packages"):
if text is not None and len(text) == 0:
return replacement
return text
def maybe_start_on_next_line(string):
# If `string` is multiline, prepend a \n to it.
if string is not None and len(string.split("\n")) > 1:
return "\n{}\n".format(string)
return string
mutable_dict = envinfo._asdict()
# If nvidia_gpu_models is multiline, start on the next line
mutable_dict["nvidia_gpu_models"] = maybe_start_on_next_line(envinfo.nvidia_gpu_models)
# If the machine doesn't have CUDA, report some fields as 'No CUDA'
dynamic_cuda_fields = [
"cuda_runtime_version",
"nvidia_gpu_models",
"nvidia_driver_version",
]
all_cuda_fields = dynamic_cuda_fields + ["cudnn_version"]
all_dynamic_cuda_fields_missing = all(mutable_dict[field] is None for field in dynamic_cuda_fields)
if TORCH_AVAILABLE and not torch.cuda.is_available() and all_dynamic_cuda_fields_missing:
for field in all_cuda_fields:
mutable_dict[field] = "No CUDA"
if envinfo.cuda_compiled_version is None:
mutable_dict["cuda_compiled_version"] = "None"
# Replace True with Yes, False with No
mutable_dict = replace_bools(mutable_dict)
# Replace all None objects with 'Could not collect'
mutable_dict = replace_nones(mutable_dict)
# If either of these are '', replace with 'No relevant packages'
mutable_dict["pip_packages"] = replace_if_empty(mutable_dict["pip_packages"])
mutable_dict["conda_packages"] = replace_if_empty(mutable_dict["conda_packages"])
# Tag conda and pip packages with a prefix
# If they were previously None, they'll show up as ie '[conda] Could not collect'
if mutable_dict["pip_packages"]:
mutable_dict["pip_packages"] = prepend(mutable_dict["pip_packages"], "[{}] ".format(envinfo.pip_version))
if mutable_dict["conda_packages"]:
mutable_dict["conda_packages"] = prepend(mutable_dict["conda_packages"], "[conda] ")
mutable_dict["cpu_info"] = envinfo.cpu_info
return env_info_fmt.format(**mutable_dict)
def get_pretty_env_info():
return pretty_str(get_env_info())
def main():
print("Collecting environment information...")
output = get_pretty_env_info()
print(output)
if TORCH_AVAILABLE and hasattr(torch, "utils") and hasattr(torch.utils, "_crash_handler"):
minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
if sys.platform == "linux" and os.path.exists(minidump_dir):
dumps = [os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)]
latest = max(dumps, key=os.path.getctime)
ctime = os.path.getctime(latest)
creation_time = datetime.datetime.fromtimestamp(ctime).strftime("%Y-%m-%d %H:%M:%S")
msg = (
"\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time)
+ "if this is related to your bug please include it when you file a report ***"
)
print(msg, file=sys.stderr)
if __name__ == "__main__":
main()
ARG VLLM_BASE_IMAGE=vllm/vllm-openai
ARG VLLM_BASE_TAG=v0.15.0
FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG}
ARG APP_DIR=/workspace/vllm-omni
WORKDIR ${APP_DIR}
COPY . .
# Install system dependencies
RUN apt-get update && \
apt-get install -y ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install vllm-omni into the same uv-managed Python environment used by the base image.
RUN uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]"
RUN ln -sf /usr/bin/python3 /usr/bin/python
ENTRYPOINT []
ARG VLLM_ASCEND_IMAGE=quay.nju.edu.cn/ascend/vllm-ascend
ARG VLLM_ASCEND_TAG=v0.11.0rc2
FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}
ARG APP_DIR=/vllm-workspace/vllm-omni
WORKDIR ${APP_DIR}
COPY . .
# Install vllm-omni with dev dependencies
RUN pip install --no-cache-dir -e ".[dev]"
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENTRYPOINT []
ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.15.0
FROM ${BASE_IMAGE} AS final
ARG COMMON_WORKDIR=/app
WORKDIR ${COMMON_WORKDIR}
# Step 1: Setup - Install system dependencies
RUN apt-get update && \
apt-get install -y ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni
# Step 2: Copy vllm-omni code and install it into the base image's uv-managed Python environment
COPY . ${COMMON_WORKDIR}/vllm-omni
RUN cd ${COMMON_WORKDIR}/vllm-omni && uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]"
# onnxruntime-rocm and sox are dependencies of Qwen3-TTS; the system-installed
# onnxruntime must be uninstalled before installing onnxruntime-rocm.
RUN uv pip uninstall onnxruntime --system && uv pip install --no-cache-dir onnxruntime-rocm sox --system
RUN ln -sf /usr/bin/python3 /usr/bin/python
CMD ["/bin/bash"]
ENTRYPOINT []
# Set the entrypoint for the official vllm-openai images
FROM final AS vllm-openai
ENTRYPOINT ["vllm", "serve", "--omni"]
nav:
- Home: README.md
- User Guide:
- Getting Started:
- getting_started/quickstart.md
- getting_started/installation/*
- Serving:
- OpenAI-Compatible API:
- Image Generation: serving/image_generation_api.md
- Image Edit: serving/image_edit_api.md
- Examples:
- examples/README.md
- Offline Inference:
- Image-To-Image: user_guide/examples/offline_inference/image_to_image.md
- Image-To-Video: user_guide/examples/offline_inference/image_to_video.md
- Qwen2.5-Omni: user_guide/examples/offline_inference/qwen2_5_omni.md
- Qwen3-Omni: user_guide/examples/offline_inference/qwen3_omni.md
- Qwen3-TTS Offline Inference: user_guide/examples/offline_inference/qwen3_tts.md
- Text-To-Image: user_guide/examples/offline_inference/text_to_image.md
- Text-To-Video: user_guide/examples/offline_inference/text_to_video.md
- Online Serving:
- Image-To-Image: user_guide/examples/online_serving/image_to_image.md
- Qwen2.5-Omni: user_guide/examples/online_serving/qwen2_5_omni.md
- Qwen3-Omni: user_guide/examples/online_serving/qwen3_omni.md
- Text-To-Image: user_guide/examples/online_serving/text_to_image.md
- General:
- usage/*
- Configuration:
- configuration/README.md
- configuration/*
- Models:
- models/supported_models.md
- Features:
- Sleep Mode: features/sleep_mode.md
- Diffusion Features:
- Overview: user_guide/diffusion_acceleration.md
- TeaCache: user_guide/diffusion/teacache.md
- Cache-DiT: user_guide/diffusion/cache_dit_acceleration.md
- Parallelism Acceleration: user_guide/diffusion/parallelism_acceleration.md
- CPU Offloading: user_guide/diffusion/cpu_offload_diffusion.md
- Developer Guide:
- General:
- contributing/README.md
- glob: contributing/*
flatten_single_child_sections: true
- Model Implementation:
- contributing/model/README.md
- contributing/model/adding_omni_model.md
- contributing/model/adding_diffusion_model.md
- CI: contributing/ci
- Design Documents:
- design/index.md
- design/architecture_overview.md
- Feature Design:
- design/feature/disaggregated_inference.md
- design/feature/ray_based_execution.md
- Module Design:
- design/module/ar_module.md
- design/module/dit_module.md
- design/module/entrypoint_module.md
- Docs Guide: contributing/DOCS_GUIDE.md
- API Reference:
- api/README.md
- api/vllm_omni
- CLI Reference: cli
- Community:
- community/*
- Slack: https://slack.vllm.ai
- Blog: https://blog.vllm.ai
- Forum: https://discuss.vllm.ai
---
hide:
- navigation
- toc
---
# Welcome to vLLM-Omni
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="./source/logos/vllm-omni-logo.png">
<img alt="vllm-omni" src="./source/logos/vllm-omni-logo.png" width=55%>
</picture>
</p>
<h3 align="center">
Easy, fast, and cheap omni-modality model serving for everyone
</h3>
<p style="text-align:center">
<script async defer src="https://buttons.github.io/buttons.js"></script>
<a class="github-button" href="https://github.com/vllm-project/vllm-omni" data-show-count="true" data-size="large" aria-label="Star">Star</a>
<a class="github-button" href="https://github.com/vllm-project/vllm-omni/subscription" data-show-count="true" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
<a class="github-button" href="https://github.com/vllm-project/vllm-omni/fork" data-show-count="true" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
</p>
## About
[vLLM](https://github.com/vllm-project/vllm) was originally designed to support large language models for text-based autoregressive generation tasks. vLLM-Omni is a framework that extends it to support omni-modality model inference and serving:
- **Omni-modality**: Text, image, video, and audio data processing
- **Non-autoregressive Architectures**: Extends vLLM's AR support to Diffusion Transformers (DiT) and other parallel generation models
- **Heterogeneous outputs**: From traditional text generation to multimodal outputs
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="./source/architecture/omni-modality-model-architecture.png">
<img alt="vllm-omni-arch" src="./source/architecture/omni-modality-model-architecture.png" width=55%>
</picture>
</p>
vLLM-Omni is fast with:
- State-of-the-art AR support, leveraging efficient KV cache management from vLLM
- Pipelined, overlapped stage execution for high throughput
- Full disaggregation based on OmniConnector, with dynamic resource allocation across stages
vLLM-Omni is flexible and easy to use with:
- Heterogeneous pipeline abstraction to manage complex model workflows
- Seamless integration with popular Hugging Face models
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
vLLM-Omni seamlessly supports most popular open-source models on Hugging Face, including:
- Omni-modality models (e.g. Qwen2.5-Omni, Qwen3-Omni)
- Multi-modality generation models (e.g. Qwen-Image)
For more information, check out the following:
- [vllm-omni architecture design and recent roadmaps](https://docs.google.com/presentation/d/1qv4qMW1rKAqDREMXiUDLIgqqHQe7TDPj/edit?usp=sharing&ouid=110473603432222024453&rtpof=true&sd=true)
- [vllm-omni announcement blogpost](https://blog.vllm.ai/2025/11/30/vllm-omni.html)
# Summary
## Entry Points
Main entry points for vLLM-Omni inference and serving.
- [vllm_omni.entrypoints.async_omni.AsyncOmni][]
- [vllm_omni.entrypoints.async_omni_diffusion.AsyncOmniDiffusion][]
- [vllm_omni.entrypoints.async_omni_llm.AsyncOmniLLM][]
- [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalContentParser][]
- [vllm_omni.entrypoints.chat_utils.OmniAsyncMultiModalItemTracker][]
- [vllm_omni.entrypoints.chat_utils.parse_chat_messages_futures][]
- [vllm_omni.entrypoints.cli.serve.OmniServeCommand][]
- [vllm_omni.entrypoints.client_request_state.ClientRequestState][]
- [vllm_omni.entrypoints.log_utils.OrchestratorMetrics][]
- [vllm_omni.entrypoints.log_utils.StageRequestMetrics][]
- [vllm_omni.entrypoints.log_utils.StageStats][]
- [vllm_omni.entrypoints.omni.Omni][]
- [vllm_omni.entrypoints.omni.OmniBase][]
- [vllm_omni.entrypoints.omni_diffusion.OmniDiffusion][]
- [vllm_omni.entrypoints.omni_llm.OmniLLM][]
- [vllm_omni.entrypoints.omni_stage.OmniStage][]
- [vllm_omni.entrypoints.stage_utils.OmniStageTaskType][]
## Inputs
Input data structures for multi-modal inputs.
- [vllm_omni.inputs.data.OmniEmbedsPrompt][]
- [vllm_omni.inputs.data.OmniTokenInputs][]
- [vllm_omni.inputs.data.OmniTokensPrompt][]
- [vllm_omni.inputs.parse.parse_singleton_prompt_omni][]
- [vllm_omni.inputs.preprocess.OmniInputPreprocessor][]
## Engine
Engine classes for offline and online inference.
- [vllm_omni.diffusion.diffusion_engine.DiffusionEngine][]
- [vllm_omni.engine.AdditionalInformationEntry][]
- [vllm_omni.engine.AdditionalInformationPayload][]
- [vllm_omni.engine.OmniEngineCoreOutput][]
- [vllm_omni.engine.OmniEngineCoreOutputs][]
- [vllm_omni.engine.OmniEngineCoreRequest][]
- [vllm_omni.engine.PromptEmbedsPayload][]
- [vllm_omni.engine.arg_utils.AsyncOmniEngineArgs][]
- [vllm_omni.engine.arg_utils.OmniEngineArgs][]
- [vllm_omni.engine.input_processor.OmniInputProcessor][]
- [vllm_omni.engine.output_processor.MultimodalOutputProcessor][]
- [vllm_omni.engine.output_processor.OmniRequestState][]
## Core
Core scheduling and caching components.
- [vllm_omni.core.sched.omni_ar_scheduler.KVCacheTransferData][]
- [vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler][]
- [vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler][]
- [vllm_omni.core.sched.output.OmniCachedRequestData][]
- [vllm_omni.core.sched.output.OmniNewRequestData][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.DistributedGroupResidualVectorQuantization][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.DistributedResidualVectorQuantization][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.EuclideanCodebook][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.VectorQuantization][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.vq.core_vq.preprocess][]
## Configuration
Configuration classes.
- [vllm_omni.config.model.OmniModelConfig][]
- [vllm_omni.diffusion.cache.teacache.config.TeaCacheConfig][]
- [vllm_omni.distributed.omni_connectors.utils.config.ConnectorSpec][]
- [vllm_omni.distributed.omni_connectors.utils.config.OmniTransferConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSSpeakerEncoderConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSTalkerCodePredictorConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts.Qwen3TTSTalkerConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2.Qwen3TTSTokenizerV2Config][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2.Qwen3TTSTokenizerV2DecoderConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1Config][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderBigVGANConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderDiTConfig][]
- [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1EncoderConfig][]
## Workers
Worker classes and model runners for distributed inference.
- [vllm_omni.diffusion.worker.gpu_diffusion_model_runner.GPUDiffusionModelRunner][]
- [vllm_omni.diffusion.worker.gpu_diffusion_worker.GPUDiffusionWorker][]
- [vllm_omni.diffusion.worker.gpu_diffusion_worker.WorkerProc][]
- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorker][]
- [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorkerProc][]
- [vllm_omni.worker.gpu_ar_model_runner.ExecuteModelState][]
- [vllm_omni.worker.gpu_ar_model_runner.GPUARModelRunner][]
- [vllm_omni.worker.gpu_ar_worker.GPUARWorker][]
- [vllm_omni.worker.gpu_generation_model_runner.GPUGenerationModelRunner][]
- [vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker][]
- [vllm_omni.worker.gpu_model_runner.OmniGPUModelRunner][]
- [vllm_omni.worker.npu.npu_ar_model_runner.ExecuteModelState][]
- [vllm_omni.worker.npu.npu_ar_model_runner.NPUARModelRunner][]
- [vllm_omni.worker.npu.npu_ar_worker.NPUARWorker][]
- [vllm_omni.worker.npu.npu_generation_model_runner.NPUGenerationModelRunner][]
- [vllm_omni.worker.npu.npu_generation_worker.NPUGenerationWorker][]
- [vllm_omni.worker.npu.npu_model_runner.OmniNPUModelRunner][]
# vLLM-Omni CLI Guide
The vLLM-Omni CLI inherits from the vLLM CLI and adds some arguments.
## serve
Starts the vLLM-Omni OpenAI-compatible API server.
Start with a model:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni
```
Specify the port:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091
```
If you have a custom stage configs file, launch the server with the command below:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --stage-configs-path /path/to/stage_configs_file
```
## bench
Run benchmark tests for online serving throughput.
Available Commands:
```bash
vllm bench serve --omni \
--model Qwen/Qwen2.5-Omni-7B \
--host server-host \
--port server-port \
--random-input-len 32 \
--random-output-len 4 \
--num-prompts 5
```
See [vllm bench serve](./bench/serve.md) for the full reference of all available arguments.
# vLLM-Omni Benchmark CLI Guide
The `vllm bench` command launches the vLLM-Omni benchmark to evaluate the performance of multimodal models.
## Notes
Currently only the `openai-chat-omni` backend is supported.
## Basic Parameter Description
You can use `vllm bench serve --omni --help=all` to get descriptions of all parameters. The commonly used parameters are described below:
- `--omni`
Enable Omni (multimodal) mode, supporting multimodal inputs and outputs such as images, videos, and audio.
- `--backend`
Specify the backend adapter as `openai-chat-omni`, which uses the OpenAI Chat-compatible API as the protocol. Currently only `openai-chat-omni` is supported.
- `--model`
The model identifier to load; it must be one of the models supported by vLLM-Omni.
- `--endpoint`
The API endpoint exposed externally, to which clients send their requests.
- `--dataset-name`
The name of the dataset to use; `random-mm` generates random multimodal inputs (images, videos, audio).
- `--num-prompts`
The total number of requests to send, an integer.
- `--max-concurrency`
Maximum number of concurrent requests. This can be used to help simulate an environment where a higher-level component is enforcing a maximum number of concurrent requests. While the `--request-rate` argument controls the rate at which requests are initiated, this argument controls how many are actually allowed to execute at a time. When the two are used in combination, the actual request rate may be lower than specified with `--request-rate` if the server is not processing requests fast enough to keep up.
- `--request-rate`
Number of requests per second. If this is `inf`, all requests are sent at time 0. Otherwise, a Poisson process or gamma distribution is used to synthesize the request arrival times.
- `--ignore-eos`
Set the `ignore_eos` flag when sending the benchmark request.
- `--metric-percentiles`
Comma-separated list of percentiles for the selected metrics. To report the 25th, 50th, and 75th percentiles, use "25,50,75". The default value is "99". Use `--percentile-metrics` to select metrics.
- `--percentile-metrics`
Comma-separated list of metrics to report percentiles for. Allowed metric names are "ttft", "tpot", "itl", "e2el", "audio_ttfp", and "audio_rtf".
- `--save-result`
Save benchmark results to a JSON file.
- `--save-detailed`
When saving the results, include per-request information such as response, error, ttfs, tpots, etc.
- `--result-dir`
Directory in which to save benchmark JSON results. If not specified, results are saved in the current directory.
- `--result-filename`
Filename for the benchmark JSON results. If not specified, results are saved in `{label}-{args.request_rate}qps-{base_model_id}-{current_dt}.json`.
- `--random-prefix-len`
Number of fixed prefix tokens before the random context in a request. The total input length is the sum of `random-prefix-len` and a random context length sampled from [input_len * (1 - range_ratio), input_len * (1 + range_ratio)]; see the sketch after this list. Only the random and random-mm modes support this parameter.
- `--random-input-len`
Number of input tokens per request. Only the random and random-mm modes support this parameter.
- `--random-output-len`
Number of output tokens per request. Only the random and random-mm modes support this parameter.
- `--random-range-ratio`
Range ratio for sampling input/output length,
used only for random sampling. Must be in the range [0, 1) to define
a symmetric sampling range
[length * (1 - range_ratio), length * (1 + range_ratio)].
Only the random and random-mm modes support this parameter.
- `--random-mm-base-items-per-request`
Base number of multimodal items per request for random-mm.
Actual per-request count is sampled around this base using
--random-mm-num-mm-items-range-ratio.
Only the random-mm mode supports this parameter.
- `--random-mm-limit-mm-per-prompt`
Per-modality hard caps for items attached per request, e.g.
'{"image": 3, "video": 1, "audio": 1}'. The sampled per-request item
count is clamped to the sum of these limits. When a modality
reaches its cap, its buckets are excluded and probabilities are
renormalized.
Only the random-mm mode supports this parameter.
- `--random-mm-num-mm-items-range-ratio`
Range ratio r in [0, 1] for sampling items per request.
We sample uniformly from the closed integer range
[floor(n*(1-r)), ceil(n*(1+r))]
where n is the base items per request.
r=0 keeps it fixed; r=1 allows 0 items. The maximum is clamped
to the sum of per-modality limits from
--random-mm-limit-mm-per-prompt.
An error is raised if the computed min exceeds the max.
Only the random-mm mode supports this parameter.
- `--random-mm-bucket-config`
The bucket config is a dictionary mapping a multimodal item sampling configuration to a probability. Three modalities are currently allowed: audio, images, and videos. A bucket key is a tuple of (height, width, num_frames); the value is the probability of sampling that specific item. Example:
--random-mm-bucket-config "{(256, 256, 1): 0.5, (720, 1280, 16): 0.4, (0, 1, 5): 0.10}"
First item: images with resolution 256x256, with probability 0.5.
Second item: videos with resolution 720x1280 and 16 frames, with probability 0.4.
Third item: audios with 1 s duration and 5 channels, with probability 0.1.
Note: if the probabilities do not sum to 1, they are normalized. Only the random-mm mode supports this parameter.
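To make the length arguments concrete, here is a minimal sketch of how `--random-prefix-len`, `--random-input-len`, and `--random-range-ratio` combine into a total input length. It is illustrative only and assumes uniform integer sampling; the benchmark's actual sampler may differ in rounding and distribution.

```python
import random


def sample_total_input_len(input_len: int, range_ratio: float, prefix_len: int) -> int:
    """Illustrative only: draw a context length from the symmetric range
    [input_len * (1 - r), input_len * (1 + r)] and add the fixed prefix."""
    low = int(input_len * (1 - range_ratio))
    high = int(input_len * (1 + range_ratio))
    return prefix_len + random.randint(low, high)


# e.g. --random-prefix-len 5 --random-input-len 10 --random-range-ratio 0.0
# always yields 5 + 10 = 15 total input tokens per request.
print(sample_total_input_len(input_len=10, range_ratio=0.0, prefix_len=5))
```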
## Usage Examples
### Online Benchmark
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
First start serving your model:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni
```
Then run the benchmark with the ShareGPT dataset:
```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
vllm bench serve \
--omni \
--port 43845 \
--model /home/models/Qwen/Qwen3-Omni-30B-A3B-Instruct \
--endpoint /v1/chat/completions \
--backend openai-chat-omni \
--num-prompts 2 \
--dataset-name sharegpt \
--dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
--percentile-metrics ttft,tpot,itl,e2el
```
If successful, you will see the following output:
```text
============ Serving Benchmark Result ============
Successful requests: 2
Failed requests: 0
Benchmark duration (s): 81.63
Request throughput (req/s): 0.02
Peak concurrent requests: 2.00
----------------End-to-end Latency----------------
Mean E2EL (ms): 56966.13
Median E2EL (ms): 56966.13
P99 E2EL (ms): 81016.80
================== Text Result ===================
Total input tokens: 36
Total generated tokens: 5926
Output token throughput (tok/s): 72.60
Peak output token throughput (tok/s): 103.00
Peak concurrent requests: 2.00
Total Token throughput (tok/s): 73.04
---------------Time to First Token----------------
Mean TTFT (ms): 124.76
Median TTFT (ms): 124.76
P99 TTFT (ms): 156.10
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 481.30
Median TPOT (ms): 481.30
P99 TPOT (ms): 947.55
---------------Inter-token Latency----------------
Mean ITL (ms): 25.11
Median ITL (ms): 0.33
P99 ITL (ms): 25.17
================== Audio Result ==================
Total audio duration generated(s): 3.95
Total audio frames generated: 94890
Audio throughput(audio duration/s): 0.05
==================================================
```
Or run the benchmark with the random dataset:
```bash
vllm bench serve \
--omni \
--port 43845 \
--endpoint /v1/chat/completions \
--backend openai-chat-omni \
--model /home/models/Qwen/Qwen3-Omni-30B-A3B-Instruct \
--dataset-name random \
--num-prompts 2 \
--random-prefix-len 5 \
--random-input-len 10 \
--random-output-len 100 \
--percentile-metrics ttft,tpot,itl,e2el,audio_ttfp,audio_rtf \
--ignore-eos
```
If successful, you will see the following output:
```text
============ Serving Benchmark Result ============
Successful requests: 2
Failed requests: 0
Benchmark duration (s): 24.35
Request throughput (req/s): 0.08
Peak concurrent requests: 2.00
----------------End-to-end Latency----------------
Mean E2EL (ms): 22576.23
Median E2EL (ms): 22576.23
P99 E2EL (ms): 24205.72
================== Text Result ===================
Total input tokens: 30
Total generated tokens: 8973
Output token throughput (tok/s): 368.52
Peak output token throughput (tok/s): 81.00
Peak concurrent requests: 2.00
Total Token throughput (tok/s): 369.76
---------------Time to First Token----------------
Mean TTFT (ms): 125.16
Median TTFT (ms): 125.16
P99 TTFT (ms): 155.88
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 5.01
Median TPOT (ms): 5.01
P99 TPOT (ms): 5.42
---------------Inter-token Latency----------------
Mean ITL (ms): 34.15
Median ITL (ms): 0.01
P99 ITL (ms): 376.19
================== Audio Result ==================
Total audio duration generated(s): 3.95
Total audio frames generated: 94890
Audio throughput(audio duration/s): 0.16
---------------Time to First Packet---------------
Mean AUDIO_TTFP (ms): 11756.89
Median AUDIO_TTFP (ms): 11756.89
P99 AUDIO_TTFP (ms): 20854.25
-----------------Real Time Factor-----------------
Mean AUDIO_RTF: 3.75
Median AUDIO_RTF: 3.75
P99 AUDIO_RTF: 7.39
==================================================
```
Note:
RTF is computed as (audio generation time - first packet latency) / audio duration.
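As a hedged illustration of that formula (the function and variable names below are made up for this sketch, not fields reported by the benchmark):

```python
def audio_rtf(audio_generation_time_s: float, first_packet_latency_s: float,
              audio_duration_s: float) -> float:
    """Real-time factor as described above: generation time after the first
    packet, divided by the duration of the generated audio. Values below 1.0
    mean audio is produced faster than real time."""
    return (audio_generation_time_s - first_packet_latency_s) / audio_duration_s


# Example: 20 s spent generating, 11.8 s until the first packet, 3.95 s of audio
# -> RTF of about 2.08 (slower than real time).
print(round(audio_rtf(20.0, 11.8, 3.95), 2))
```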
</details>
### Multi-Modal Benchmark
<details class="admonition abstract" markdown="1">
<summary>Show more</summary>
Benchmark the performance of multi-modal requests in vLLM-Omni.
Generate synthetic image, video, and audio inputs alongside random text prompts to stress-test multimodal models without external datasets.
Notes:
- Works only with online benchmark via the OpenAI backend (`--backend openai-chat-omni`) and endpoint `/v1/chat/completions`.
Start the server (example):
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni
```
It is recommended to use the `--ignore-eos` flag to simulate real responses. You can set the size of the output via the `--random-output-len` argument.
Then run the benchmarking script:
```bash
vllm bench serve \
--omni \
--dataset-name random-mm \
--port 40849 \
--model /home/models/Qwen/Qwen3-Omni-30B-A3B-Instruct \
--endpoint /v1/chat/completions \
--backend openai-chat-omni \
--request-rate 1 \
--num-prompts 1 \
--random-input-len 10 \
--random-range-ratio 0.0 \
--random-mm-base-items-per-request 2 \
--random-mm-num-mm-items-range-ratio 0 \
--random-mm-limit-mm-per-prompt '{"image":1,"video":1, "audio": 1}' \
--random-mm-bucket-config '{"(32, 32, 1)": 0.5, "(0, 1, 1)": 0.1, "(32, 32, 2)":0.4}' \
--ignore-eos \
--percentile-metrics ttft,tpot,itl \
--random-output-len 2 \
--extra_body '{"modalities": ["text"]}'
```
If successful, you will see the following output:
```text
============ Serving Benchmark Result ============
Successful requests: 1
Failed requests: 0
Request rate configured (RPS): 1.00
Benchmark duration (s): 1.21
Request throughput (req/s): 0.83
Peak concurrent requests: 1.00
================== Text Result ===================
Total input tokens: 10
Total generated tokens: 3
Output token throughput (tok/s): 2.49
Peak output token throughput (tok/s): 3.00
Peak concurrent requests: 1.00
Total Token throughput (tok/s): 10.77
---------------Time to First Token----------------
Mean TTFT (ms): 179.74
Median TTFT (ms): 179.74
P99 TTFT (ms): 179.74
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 12.76
Median TPOT (ms): 12.76
P99 TPOT (ms): 12.76
---------------Inter-token Latency----------------
Mean ITL (ms): 12.76
Median ITL (ms): 12.76
P99 ITL (ms): 25.24
================== Audio Result ==================
Total audio duration generated(s): 0.00
Total audio frames generated: 0
Audio throughput(audio duration/s): 0.00
==================================================
```
Behavioral notes:
- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
How sampling works:
- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
This should be seen as an edge case; this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that doing so might cause errors due to the engine config `--limit-mm-per-prompt`. A minimal sketch of this sampling procedure is shown after this list.
- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
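For illustration, here is a minimal, hypothetical sketch of the sampling procedure described above (not the actual benchmark code); the bucket keys, limits, and probabilities mirror the CLI flags shown earlier, and the bucket-to-modality mapping is an assumption for this sketch.
```python
import random

def modality_of(bucket):
    # Illustrative mapping only; the real tool's bucket-to-modality rule may differ.
    height, width, frames = bucket
    return "image" if frames == 1 else "video"

def sample_mm_items(k, buckets, limits):
    """Sample k (H, W, T) buckets using the probabilities from --random-mm-bucket-config,
    respecting the per-modality caps from --random-mm-limit-mm-per-prompt."""
    counts = {}
    items = []
    for _ in range(k):
        # Exclude buckets whose modality already hit its cap, then renormalize.
        active = {b: p for b, p in buckets.items()
                  if counts.get(modality_of(b), 0) < limits.get(modality_of(b), 0)}
        if not active:
            break
        total = sum(active.values())
        chosen = random.choices(list(active), weights=[p / total for p in active.values()])[0]
        counts[modality_of(chosen)] = counts.get(modality_of(chosen), 0) + 1
        items.append(chosen)
    return items

# Example: two items per request, at most one image and one video.
print(sample_mm_items(2, {(32, 32, 1): 0.5, (32, 32, 2): 0.5}, {"image": 1, "video": 1}))
```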
</details>
# Contact Us
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm-omni/issues)
- For coordinating contributions and development, and for discussions with other users and developers, please join the `sig-omni` channel in our [Slack](https://slack.vllm.ai/) or use the [vLLM Forum](https://discuss.vllm.ai/)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm-omni/security/advisories) feature
# Volunteers for Bugfix and CI
We encourage you to check the current docs and [issues](https://github.com/vllm-project/vllm-omni/issues) for possible solutions to your questions. If none of these resolve it, please open an issue describing the bug or CI problem you hit during development.
If you urgently need help locating or fixing a bug or a CI problem, please contact the community volunteers listed below.
| Dec 4-Dec 12 | Dec 15-Dec 19 | Dec 22-Dec 26 | Dec 29- Jan 2, 2026| Jan 5-Jan 9 | Jan 12-Jan 16 |
|----------|----------|----------|----------|----------|----------|
| <a href="https://github.com/congw729">congw729</a> | <a href="https://github.com/yinpeiqi">yinpeiqi</a> | <a href="https://github.com/tzhouam">tzhouam</a> | <a href="https://github.com/SamitHuang">SamitHuang</a> | <a href="https://github.com/gcanlin">gcanlin</a> | <a href="https://github.com/natureofnature">natureofnature</a> |
| <a href="https://github.com/david6666666">david6666666</a> | <a href="https://github.com/R2-Y">R2-Y</a> | <a href="https://github.com/hsliuustc0106">hsliuustc0106</a> | <a href="https://github.com/Gaohan123">Gaohan123</a> | <a href="https://github.com/ZJY0516">ZJY0516</a> | <a href="https://github.com/qibaoyuan">qibaoyuan</a> |
We warmly welcome more contributors to fix bugs and add new features!
# Configuration Options
This section lists the most common options for running vLLM-Omni.
For options within a vLLM engine, please refer to [vLLM Configuration](https://docs.vllm.ai/en/v0.14.0/configuration/index.html).
Currently, the main options are maintained in per-model stage configs.
For a specific example, please refer to the [Qwen2.5-Omni stage config](stage_configs/qwen2_5_omni.yaml).
For an introduction, please check the [Introduction for stage config](./stage_configs.md).
## Memory Configuration
- **[GPU Memory Calculation and Configuration](./gpu_memory_utilization.md)** - Guide on how to calculate memory requirements and set up `gpu_memory_utilization` for optimal performance
## Optimization Features
- **[TeaCache Configuration](../user_guide/diffusion/teacache.md)** - Enable TeaCache adaptive caching for DiT models to achieve 1.5x-2.0x speedup with minimal quality loss
- **[Cache-DiT Configuration](../user_guide/diffusion/cache_dit_acceleration.md)** - Enable Cache-DiT as cache acceleration backends for DiT models
- **[Parallelism Configuration](../user_guide/diffusion/parallelism_acceleration.md)** - Enable parallelism (e.g., sequence parallelism) for DiT models
# GPU Memory Calculation and Configuration
This guide explains how to calculate GPU memory requirements and properly configure `gpu_memory_utilization` for vLLM-Omni stages.
## Overview
`gpu_memory_utilization` is a critical parameter that controls how much GPU memory each stage can use. It's specified as a fraction between 0.0 and 1.0, where:
- `0.8` means 80% of the GPU's total memory
- `1.0` means 100% of the GPU's total memory (not recommended, leaves no buffer)
## How Memory is Calculated
### Memory Allocation Formula
For each stage, vLLM-Omni calculates the requested memory as:
```
requested_memory = total_gpu_memory × gpu_memory_utilization
```
The system checks that:
```
free_memory ≥ requested_memory
```
If this condition is not met, the stage will fail to initialize with an error message showing the memory requirements.
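A rough sketch of this check (illustrative only, not the actual vLLM-Omni code) could look like the following, using PyTorch's free/total memory query:
```python
import torch

def check_stage_memory(gpu_memory_utilization: float, device: int = 0) -> float:
    """Illustrative check: fail early if free GPU memory can't cover the requested fraction."""
    free_memory, total_memory = torch.cuda.mem_get_info(device)
    requested_memory = total_memory * gpu_memory_utilization
    if free_memory < requested_memory:
        raise RuntimeError(
            f"Free memory {free_memory / 1e9:.1f} GB is less than the requested "
            f"{requested_memory / 1e9:.1f} GB ({gpu_memory_utilization:.0%} of total)."
        )
    return requested_memory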
### Memory Components
The total memory used by a stage includes:
1. **Model Weights**: The size of the model parameters loaded on the GPU
2. **KV Cache**: Memory for storing key-value cache during generation
3. **Activation Memory**: Temporary memory for intermediate computations
4. **System Overhead**: Memory used by CUDA, PyTorch, and other system components
5. **Non-Torch Memory**: Memory allocated outside of PyTorch (e.g., CUDA graphs)
### Example Calculation
For a GPU with 80GB total memory:
- `gpu_memory_utilization: 0.8` → 64GB available for the stage
- `gpu_memory_utilization: 0.6` → 48GB available for the stage
- `gpu_memory_utilization: 0.15` → 12GB available for the stage
## Setting Up `gpu_memory_utilization`
### Step 1: Determine GPU Memory
First, check your GPU's total memory:
```bash
# Using nvidia-smi
nvidia-smi --query-gpu=memory.total --format=csv
# Or using Python
python -c "import torch; print(f'{torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')"
```
### Step 2: Estimate Model Memory Requirements
#### For Autoregressive (AR) Stages
AR stages typically need more memory due to:
- Large model weights
- KV cache for attention
- Activation buffers
#### For Diffusion/Generation Stages
Diffusion stages (like code2wav) typically need less memory:
- Smaller model components
- Different memory access patterns
**Typical values:**
- `0.1 - 0.3` for most diffusion stages
### Step 3: Consider Multi-Stage Scenarios
When multiple stages share the same GPU, you must ensure the sum of their `gpu_memory_utilization` values doesn't exceed 1.0.
**Example: Two stages on GPU 0**
```yaml
stage_args:
- stage_id: 0
runtime:
devices: "0"
engine_args:
gpu_memory_utilization: 0.6 # Uses 60% of GPU 0
- stage_id: 1
runtime:
devices: "0"
engine_args:
gpu_memory_utilization: 0.3 # Uses 30% of GPU 0
# Total: 90% of GPU 0 (safe, leaves 10% buffer)
```
**Important:** If stages run on different GPUs, each can use up to 1.0 independently.
### Step 4: Account for Tensor Parallelism
When using `tensor_parallel_size > 1`, the model is split across multiple GPUs, so each GPU needs less memory.
**Example: 2-way tensor parallelism**
```yaml
stage_args:
- stage_id: 0
runtime:
devices: "0,1" # Uses both GPUs
engine_args:
tensor_parallel_size: 2
gpu_memory_utilization: 0.6 # 60% per GPU
# Model is split, so each GPU uses ~30% of model memory
```
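As a rough back-of-the-envelope check (illustrative only; the numbers below are not tied to any specific model), the per-GPU weight footprint under tensor parallelism can be estimated as:
```python
def per_gpu_weight_gb(num_params_billion: float, dtype_bytes: int, tp_size: int) -> float:
    # Model weights are split roughly evenly across tensor-parallel ranks.
    return num_params_billion * dtype_bytes / tp_size

# e.g. a 30B-parameter model in BF16 (2 bytes) with tensor_parallel_size=2:
print(f"{per_gpu_weight_gb(30, 2, 2):.0f} GB per GPU")  # ~30 GB per GPU
```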
## Examples
### Qwen3-Omni-MoE on 2x H100-80GB
```yaml
stage_args:
- stage_id: 0 # Thinker stage with TP=2
runtime:
devices: "0,1"
engine_args:
tensor_parallel_size: 2
gpu_memory_utilization: 0.6 # 48GB per GPU
- stage_id: 1 # Talker stage
runtime:
devices: "1"
engine_args:
gpu_memory_utilization: 0.3 # 24GB on GPU 1
- stage_id: 2 # Code2Wav stage
runtime:
devices: "0"
engine_args:
gpu_memory_utilization: 0.1 # 8GB on GPU 0
```
**Note:** In this configuration, stages 0 and 2 share GPU 0, but they run at different times in the pipeline, so their memory usage doesn't overlap.
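To sanity-check configurations like the one above, a small helper (hypothetical; the function name and YAML path are illustrative, and the field names follow the stage config format used in this guide) can sum `gpu_memory_utilization` per GPU:
```python
from collections import defaultdict

import yaml  # PyYAML

def per_gpu_utilization(stage_config_path: str) -> dict[str, float]:
    """Sum gpu_memory_utilization per visible device; co-located stages should stay below 1.0."""
    with open(stage_config_path) as f:
        cfg = yaml.safe_load(f)
    totals: dict[str, float] = defaultdict(float)
    for stage in cfg["stage_args"]:
        util = stage["engine_args"].get("gpu_memory_utilization", 0.8)
        for dev in str(stage["runtime"]["devices"]).split(","):
            totals[dev.strip()] += util
    return dict(totals)

# For the Qwen3-Omni example above, this would report roughly {'0': 0.7, '1': 0.9}.
print(per_gpu_utilization("stage_configs/qwen3_omni_moe.yaml"))
```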
## Troubleshooting
### Error: "Free memory is less than desired GPU memory utilization"
This means the GPU doesn't have enough free memory when the stage starts.
**Solutions:**
1. Free up memory by closing other processes
2. Reduce `gpu_memory_utilization` for this stage
3. Use a GPU with more memory
4. Move the stage to a different GPU
### Error: OOM during inference
The stage initialized but ran out of memory during processing.
**Solutions:**
1. Reduce `max_num_batched_tokens`
2. Reduce `max_batch_size` in runtime config
3. Lower `gpu_memory_utilization` slightly
4. Enable quantization if supported
### Memory Not Fully Utilized
If you see low memory usage, you can:
1. Increase `gpu_memory_utilization` to allow larger KV cache
2. Increase `max_num_batched_tokens` for better batching
3. Check if other stages are limiting throughput
## Useful Formulas for Memory Calculation
### KV Cache Memory
The KV cache size depends on:
- Number of sequences in batch
- Sequence length (prompt + generation)
- Model hidden size
- Number of attention heads
- Number of layers
Approximate formula:
```
kv_cache_memory ≈ batch_size × seq_len × hidden_size × num_layers × 2 × dtype_size
```
The factor of 2 accounts for both keys (K) and values (V).
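For example, plugging illustrative numbers into the formula above (a hypothetical helper, not part of vLLM-Omni):
```python
def kv_cache_gb(batch_size: int, seq_len: int, hidden_size: int,
                num_layers: int, dtype_bytes: int = 2) -> float:
    # The factor of 2 covers keys and values; dtype_bytes=2 assumes FP16/BF16.
    return batch_size * seq_len * hidden_size * num_layers * 2 * dtype_bytes / 1e9

# Illustrative only: batch of 4, 8K-token sequences, 4096 hidden size, 32 layers.
print(f"{kv_cache_gb(4, 8192, 4096, 32):.1f} GB")  # ~17.2 GB
```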
### Model Weight Memory
```
model_memory ≈ num_parameters × dtype_size
```
For example:
- 7B parameters in FP16: ~14GB
- 7B parameters in FP32: ~28GB
- 7B parameters in INT8: ~7GB
### Activation Memory
Activation memory is typically smaller but varies with:
- Batch size
- Sequence length
- Model architecture
It's usually 10-30% of model weight memory during inference.
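Putting the pieces together, a very rough per-stage estimate can be formed from the components above. In the sketch below, the 20% activation fraction and 2 GB overhead are assumptions for illustration, not measured values:
```python
def estimate_stage_memory_gb(weight_gb: float, kv_cache_gb: float,
                             activation_frac: float = 0.2, overhead_gb: float = 2.0) -> float:
    # weights + KV cache + activations (assumed ~10-30% of weights) + system/non-torch overhead
    return weight_gb + kv_cache_gb + weight_gb * activation_frac + overhead_gb

# e.g. a 14 GB model with a 17 GB KV cache -> roughly 36 GB
print(f"{estimate_stage_memory_gb(14, 17):.0f} GB")
```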
# Stage configs for vLLM-Omni
In vLLM-Omni, the target model is separated into multiple stages, each processed by a different engine (an LLMEngine, a DiffusionEngine, or another engine type). Depending on the stage type, such as an Autoregressive (AR) stage or a Diffusion Transformer (DiT) stage, each stage can choose the corresponding scheduler and model worker to load with its engine in a plug-in fashion.
!!! note
Default stage config YAMLs (for example, `vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml` and `vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml`) are bundled and loaded automatically when `stage_configs_path` is not provided. They have been verified to work on 1xH100 for Qwen2.5-Omni and 2xH100 for Qwen3-Omni.
Therefore, as a core part of vLLM-Omni, the stage configs for a model have several main functions:
- Declare the partition into stages and the corresponding class implementations in `model_executor/models`.
- The disaggregated configuration for each stage and the communication topology among them.
- Engine arguments for each engine within the stage.
- Input and output dependencies for each stage.
- Default input parameters.
If users want to modify any part of it, a custom stage_configs file can be passed as an argument in both online and offline modes, as in the examples below:
For offline use (assuming the necessary dependencies have been imported):
```python
model_name = "Qwen/Qwen2.5-Omni-7B"
omni_llm = OmniLLM(model=model_name, stage_configs_path="/path/to/custom_stage_configs.yaml")
```
For online serving:
```bash
vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8091 --stage-configs-path /path/to/stage_configs_file
```
!!! important
    We are actively iterating on the definition of stage configs, and we welcome all feedback from both community users and developers to help us shape its development!
Below is a specific example of a stage_configs.yaml for Qwen2.5-Omni.
```yaml
# stage config for running qwen2.5-omni with architecture of OmniLLM.
stage_args:
- stage_id: 0 # mark the unique id for each stage
runtime: # The disaggregated configuration
process: true # Run this stage in a separate process
devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
max_batch_size: 1 # the batch_size for offline inference
engine_args: # Engine arguments for a certain engine
model_stage: thinker
model_arch: Qwen2_5OmniForConditionalGeneration # The model implementation registered in model_executor/models/registry.py
worker_type: ar # The specific worker used
      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler # The specific scheduler used
      gpu_memory_utilization: 0.8 # The GPU memory fraction allocated to this stage on a single chip
      enforce_eager: true # Currently only eager mode is supported
      trust_remote_code: true # Needed for Hugging Face config parsing
      engine_output_type: latent # The stage outputs latent hidden states in addition to token IDs
      enable_prefix_caching: false # Prefix caching is not yet supported for requests with hidden-state outputs
      is_comprehension: true # Whether this stage is a text/multimodal comprehension module; if so, AsyncOmni uses its tokenizer by default
      final_output: true # Whether this stage's output is part of the final outputs; if false, the stage is only an intermediate stage
      final_output_type: text # The final output type; currently text or audio
default_sampling_params: # sampling parameters for the stage. Their meaning aligns with vLLM.
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.1
- stage_id: 1
runtime:
process: true
devices: "1"
max_batch_size: 3
engine_args:
model_stage: talker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.8
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: latent
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params:
temperature: 0.9
top_p: 0.8
top_k: 40
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.05
stop_token_ids: [8294]
- stage_id: 2
runtime:
process: true
devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization: 0.15
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio
engine_input_source: [1]
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.1
# Top-level runtime config (concise): default windows and stage edges
runtime:
enabled: true
defaults:
window_size: -1 # Simplified: trigger downstream only after full upstream completion
max_inflight: 1 # Simplified: process serially within each stage
edges:
- from: 0 # thinker → talker: trigger only after receiving full input (-1)
to: 1
window_size: -1
- from: 1 # talker → code2wav: trigger only after receiving full input (-1)
to: 2
window_size: -1
```
## Stage Configuration Arguments
Each stage in the `stage_args` list contains the following configuration options:
### `stage_id`
A unique identifier for each stage in the multi-stage pipeline. Stages are numbered sequentially starting from 0, and this ID is used to reference stages in inter-stage dependencies (e.g., `engine_input_source`).
### `runtime`
Configuration for disaggregated execution of the stage, controlling how the stage is deployed and executed.
#### `runtime.process`
Whether to run this stage in a separate process. When set to `true`, the stage will be executed in an isolated process, enabling better resource isolation and parallel execution across different stages. This is essential for multi-GPU deployments where different stages run on different devices.
Default: `true`
#### `runtime.devices`
Visible devices for this stage, specified as a string. This controls which GPU devices are available to the stage process, similar to setting `CUDA_VISIBLE_DEVICES` or using `torch.cuda.set_device()`. For example, `"0"` uses GPU 0, `"1"` uses GPU 1, and `"0,1"` makes both GPUs 0 and 1 visible.
Default: `"0"`
#### `runtime.max_batch_size`
The maximum batch size for offline inference in this stage. This limits how many sequences can be processed together in a single batch during offline inference operations.
Default: `1`
### `engine_args`
Engine arguments for configuring the LLM engine, diffusion engine, or other engine types used by this stage.
#### `engine_args.model_stage`
The name identifier for this model stage within the multi-stage architecture. This is used internally to distinguish different stages of the same model (e.g., "thinker", "talker", "code2wav" in Qwen2.5-Omni).
#### `engine_args.model_arch`
The model architecture class name that is registered in `model_executor/models/registry.py`. This specifies which model implementation to use for this stage. The class must be registered in the model registry for vLLM-Omni to locate and instantiate it.
#### `engine_args.worker_cls`
The specific worker class to use for this stage. This determines how the model computations are executed. Examples include `vllm_omni.worker.gpu_ar_worker.GPUARWorker` for autoregressive stages and `vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker` for diffusion-based stages.
#### `engine_args.scheduler_cls`
The scheduler class to use for this stage. The scheduler manages request queuing, batching, and execution order. Examples include `vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler` for standard stages and `vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler` for diffusion stages.
#### `engine_args.gpu_memory_utilization`
The fraction of GPU memory to allocate for this stage within a single GPU chip. This is a value between 0.0 and 1.0, where 0.8 means 80% of the GPU memory will be used by this stage. This allows fine-grained control over memory allocation when multiple stages share the same GPU or when reserving memory for other operations.
Default: `0.8`
!!! tip "Memory Configuration Guide"
For detailed information on how to calculate memory requirements and properly configure `gpu_memory_utilization`, see the [GPU Memory Calculation and Configuration Guide](./gpu_memory_utilization.md).
#### `engine_args.enforce_eager`
Whether to enforce eager execution mode. When set to `true`, the engine will run in eager mode without using CUDA graphs or other compilation optimizations. Currently, vLLM-Omni only supports eager mode.
Default: `true`
#### `engine_args.trust_remote_code`
Whether to trust remote code when loading models from Hugging Face. This is required for models that use custom code in their configuration files. Set to `true` when loading models that require custom model implementations.
Default: `true`
#### `engine_args.engine_output_type`
Specifies the type of output produced by this stage's engine. This determines what kind of data flows to downstream stages. Possible values include `latent` (hidden states), `text` (tokenized text), and `audio` (audio waveforms). When set to `latent`, the stage outputs latent hidden states in addition to token IDs, which are consumed by downstream stages.
Default: `latent`
#### `engine_args.enable_prefix_caching`
Whether to enable prefix caching for this stage. Prefix caching can improve performance by caching KV cache for common prompt prefixes. However, for requests that output hidden states (when `engine_output_type` is `latent`), prefix caching is not currently supported and should be set to `false`.
Default: `false`
### `is_comprehension`
Whether this stage is a text or multimodal comprehension module. When set to `true`, the stage acts as a comprehension module that processes input text or multimodal content. If this is the first comprehension stage, `AsyncOmni` will use its tokenizer as the default tokenizer for the entire pipeline.
Default: `true`
### `final_output`
Whether this stage produces output that is part of the final outputs returned to the user. When set to `false`, the stage only works as an intermediate stage, processing data that flows to downstream stages but not contributing directly to the final response.
Default: `true`
### `final_output_type`
The type of final output produced by this stage. This specifies what format the output will be in when returned to the user. Currently supported values are `text` (for text generation) and `audio` (for audio generation).
Default: `text`
### `default_sampling_params`
Default sampling parameters for this stage. These parameters control the generation behavior and align with vLLM's sampling parameter semantics. These defaults are used when no explicit sampling parameters are provided in the request.
#### `default_sampling_params.temperature`
Sampling temperature for controlling randomness. Lower values (e.g., 0.0) make the output more deterministic and focused, while higher values increase randomness.
Default: `0.0`
#### `default_sampling_params.top_p`
Nucleus sampling parameter. Only tokens with cumulative probability mass up to `top_p` are considered. This helps filter out low-probability tokens while maintaining diversity.
Default: `1.0`
#### `default_sampling_params.top_k`
Top-k sampling parameter. Only the top `k` most likely tokens are considered. Set to `-1` to disable top-k filtering and consider all tokens.
Default: `-1`
#### `default_sampling_params.max_tokens`
Maximum number of tokens to generate in this stage. This limits the length of the output sequence.
Default: `2048`
#### `default_sampling_params.seed`
Random seed for reproducible generation. When set, the random number generator will be initialized with this seed to ensure consistent outputs across runs.
Default: `42`
#### `default_sampling_params.detokenize`
Whether to detokenize the output tokens into text. When set to `true`, token IDs are converted back to readable text strings.
Default: `True`
#### `default_sampling_params.repetition_penalty`
Penalty applied to tokens that have already appeared in the generated sequence. Values greater than 1.0 discourage repetition, while values less than 1.0 encourage it. A value of 1.0 applies no penalty.
Default: `1.1`
# stage config for running qwen2.5-omni with architecture of OmniLLM.
stage_args:
- stage_id: 0
runtime:
process: true # Run this stage in a separate process
devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.8
enforce_eager: true # Now we only support eager mode
trust_remote_code: true
engine_output_type: latent
enable_prefix_caching: false
is_comprehension: true
final_output: true
final_output_type: text
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.1
- stage_id: 1
runtime:
process: true
devices: "1"
max_batch_size: 1
engine_args:
model_stage: talker
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: ar
scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
gpu_memory_utilization: 0.8
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: latent
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
default_sampling_params:
temperature: 0.9
top_p: 0.8
top_k: 40
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.05
stop_token_ids: [8294]
- stage_id: 2
runtime:
process: true
devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU
max_batch_size: 1
engine_args:
model_stage: code2wav
model_arch: Qwen2_5OmniForConditionalGeneration
worker_type: generation
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
gpu_memory_utilization: 0.15
enforce_eager: true
trust_remote_code: true
enable_prefix_caching: false
engine_output_type: audio
engine_input_source: [1]
final_output: true
final_output_type: audio
default_sampling_params:
temperature: 0.0
top_p: 1.0
top_k: -1
max_tokens: 2048
seed: 42
detokenize: True
repetition_penalty: 1.1
# Top-level runtime config (concise): default windows and stage edges
runtime:
enabled: true
defaults:
window_size: -1 # Simplified: trigger downstream only after full upstream completion
max_inflight: 1 # Simplified: process serially within each stage
edges:
- from: 0 # thinker → talker: trigger only after receiving full input (-1)
to: 1
window_size: -1
- from: 1 # talker → code2wav: trigger only after receiving full input (-1)
to: 2
window_size: -1