The `Wan-AI/Wan2.2-T2V-A14B-Diffusers` pipeline generates short videos from text prompts.
## Local CLI Usage
```bash
python text_to_video.py \
--prompt"Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."\
--negative_prompt"<optional quality filter>"\
--height 480 \
--width 832 \
--num_frames 33 \
--guidance_scale 4.0 \
--guidance_scale_high 3.0 \
--flow_shift 12.0 \
--num_inference_steps 40 \
--fps 16 \
--output t2v_out.mp4
```
Key arguments:
- `--prompt`: text description (string).
- `--height` / `--width`: output resolution (defaults 480x832, i.e. 480P). Dimensions should align with the Wan VAE downsampling factor (multiples of 8).
- `--num_frames`: number of frames (the Wan default is 81).
- `--guidance_scale` and `--guidance_scale_high`: CFG scales applied to the low-noise and high-noise transformers, respectively.
- `--flow_shift`: scheduler flow_shift (5.0 for 720p, 12.0 for 480p).
- `--negative_prompt`: optional list of artifacts to suppress (the PR demo used a long Chinese string).
- `--boundary_ratio`: boundary split ratio for the low/high DiT. The default `0.875` uses both transformers for best quality. Setting it to `1.0` loads only the low-noise transformer (saves noticeable memory with good quality; recommended if memory is limited). Setting it to `0.0` loads only the high-noise transformer (not recommended, lower quality).
- `--fps`: frames per second for the saved MP4 (requires `diffusers`' `export_to_video`).
- `--output`: path to save the generated video.
- `--vae_use_slicing`: enable VAE slicing for memory optimization.
- `--vae_use_tiling`: enable VAE tiling for memory optimization.
- `--cfg_parallel_size`: set to `2` to enable CFG parallel. See more examples in [`user_guide`](../../../docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel).
- `--enable-cpu-offload`: enable CPU offloading for the diffusion models.
> ℹ️ If you encounter OOM errors, try using `--vae_use_slicing` and `--vae_use_tiling` to reduce memory usage.
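If memory is still tight, you can also set `--boundary_ratio 1.0` and enable CPU offload. A minimal sketch combining the documented flags (values are illustrative, and it assumes the memory-related flags are boolean switches):

```bash
# Memory-constrained run: low-noise transformer only, VAE slicing/tiling, CPU offload.
python text_to_video.py \
--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
--height 480 \
--width 832 \
--num_frames 33 \
--num_inference_steps 40 \
--boundary_ratio 1.0 \
--vae_use_slicing \
--vae_use_tiling \
--enable-cpu-offload \
--output t2v_out.mp4
```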
help="Boundary split ratio for low/high DiT. Default 0.875 uses both transformers for best quality. Set to 1.0 to load only the low-noise transformer (saves noticeable memory with good quality, recommended if memory is limited).",
)
parser.add_argument(
"--flow_shift",type=float,default=5.0,help="Scheduler flow_shift (5.0 for 720p, 12.0 for 480p)."
**Note**: These examples work with the default configuration on an **NVIDIA A100 (80GB)**. We also tested on dual **NVIDIA RTX 5000 Ada (32GB each)**. For dual-GPU setups, please modify the stage configuration to distribute the model across devices.
- `--height`: Image height in pixels (default: 512)
- `--width`: Image width in pixels (default: 512)
- `--steps`: Number of inference steps (default: 25)
- `--seed`: Random seed (default: 42)
- `--negative`: Negative prompt for image generation
Example with custom parameters:
```bash
python openai_chat_client.py \
--prompt"A futuristic city"\
--modality text2img \
--height 768 \
--width 768 \
--steps 50 \
--seed 42 \
--negative"blurry, low quality"
```
## Modality Control
BAGEL-7B-MoT supports **multiple modality modes** for different use cases.
The default yaml configuration deploys Thinker and DiT on the same GPU. You can use the default configuration file: [`bagel.yaml`](../../../vllm_omni/model_executor/stage_configs/bagel.yaml)
"messages": [{"role": "user", "content": [{"type": "text", "text": "<|im_start|>user\nWhat is the capital of France?<|im_end|>\n<|im_start|>assistant\n"}]}]
"modalities": ["text"]
}'
```
## FAQ
- If you encounter an error about the librosa backend, install ffmpeg with the command below.
```bash
sudo apt update
sudo apt install ffmpeg
```
- If you are unsure how much VRAM the model needs, or you encounter OOM errors, try decreasing `max_model_len` (see the sketch below).
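A minimal sketch, assuming the server exposes a vLLM-style engine argument for the context length (the exact place to set `max_model_len` depends on how you launch the server, e.g. a stage YAML or a serve command):

```bash
# Assumption: a vLLM-style serve command; adapt to your actual launch method.
vllm serve Qwen/Qwen2.5-Omni-7B --max-model-len 8192
```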
- `--video-path` (or `-v`): path to a local video file or URL. If not provided and the query type uses video, the default video URL is used. Supports local file paths (automatically encoded to base64) and HTTP/HTTPS URLs. Example: `--video-path /path/to/video.mp4` or `--video-path https://example.com/video.mp4`
- `--image-path` (or `-i`): path to a local image file or URL. If not provided and the query type uses an image, the default image URL is used. Supports local file paths (automatically encoded to base64), HTTP/HTTPS URLs, and common image formats: JPEG, PNG, GIF, WebP. Example: `--image-path /path/to/image.jpg` or `--image-path https://example.com/image.png`
- `--audio-path` (or `-a`): path to a local audio file or URL. If not provided and the query type uses audio, the default audio URL is used. Supports local file paths (automatically encoded to base64), HTTP/HTTPS URLs, and common audio formats: MP3, WAV, OGG, FLAC, M4A. Example: `--audio-path /path/to/audio.wav` or `--audio-path https://example.com/audio.mp3`
- `--prompt` (or `-p`): custom text prompt/question. If not provided, the default prompt for the selected query type is used. Example: `--prompt "What are the main activities shown in this video?"`
For example, to use mixed modalities with all local files:
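A minimal sketch of such an invocation; the script name `openai_chat_client.py` and the `--query-type` flag are assumptions based on the options described above:

```bash
# Hypothetical invocation; script name and --query-type flag are assumptions.
python openai_chat_client.py \
--query-type mixed_modalities \
--video-path /path/to/video.mp4 \
--image-path /path/to/image.jpg \
--audio-path /path/to/audio.wav \
--prompt "What is recited in the audio? What is the content of this image? Why is this video funny?"
```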
You can control output modalities to specify which types of output the model should generate. This is useful when you only need text output and want to skip audio generation stages for better performance.
### Supported modalities
| Modalities | Output |
|------------|--------|
| `["text"]` | Text only |
| `["audio"]` | Text + Audio |
| `["text", "audio"]` | Text + Audio |
| Not specified | Text + Audio (default) |
### Using curl
#### Text only
```bash
curl http://localhost:8091/v1/chat/completions \
-H"Content-Type: application/json"\
-d'{
"model": "Qwen/Qwen2.5-Omni-7B",
"messages": [{"role": "user", "content": "Describe vLLM in brief."}],
"modalities": ["text"]
}'
```
#### Text + Audio
```bash
curl http://localhost:8091/v1/chat/completions \
-H"Content-Type: application/json"\
-d'{
"model": "Qwen/Qwen2.5-Omni-7B",
"messages": [{"role": "user", "content": "Describe vLLM in brief."}],
"modalities": ["text", "audio"]
}'
```
If you want to enable streaming output, set the corresponding argument in the request; each stage's output is then returned as soon as that stage finishes generating it. Currently only text supports streaming output; other modalities are returned normally.
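A minimal sketch, assuming the standard OpenAI-compatible `stream` field is the argument in question (an assumption; check the server's API reference for the exact name):

```bash
curl http://localhost:8091/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-Omni-7B",
"messages": [{"role": "user", "content": "Describe vLLM in brief."}],
"modalities": ["text"],
"stream": true
}'
```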
The shell snippet below shows how the request is assembled for each query type: the user content, per-stage sampling parameters, and `mm_processor_kwargs` are built, then posted to the server.

```bash
# Build user content and extra fields based on query type
case"$QUERY_TYPE"in
text)
user_content='[
{
"type": "text",
"text": "Explain the system architecture for a scalable audio generation pipeline. Answer in 15 words."
}
]'
sampling_params_list='[
'"$thinker_sampling_params"',
'"$talker_sampling_params"',
'"$code2wav_sampling_params"'
]'
mm_processor_kwargs="{}"
;;
mixed_modalities)
user_content='[
{
"type": "audio_url",
"audio_url": {
"url": "'"$MARY_HAD_LAMB_AUDIO_URL"'"
}
},
{
"type": "image_url",
"image_url": {
"url": "'"$CHERRY_BLOSSOM_IMAGE_URL"'"
}
},
{
"type": "video_url",
"video_url": {
"url": "'"$SAMPLE_VIDEO_URL"'"
}
},
{
"type": "text",
"text": "What is recited in the audio? What is the content of this image? Why is this video funny?"
}
]'
sampling_params_list='[
'"$thinker_sampling_params"',
'"$talker_sampling_params"',
'"$code2wav_sampling_params"'
]'
mm_processor_kwargs="{}"
;;
use_audio_in_video)
user_content='[
{
"type": "video_url",
"video_url": {
"url": "'"$SAMPLE_VIDEO_URL"'"
}
},
{
"type": "text",
"text": "Describe the content of the video, then convert what the baby say into text."
}
]'
sampling_params_list='[
'"$thinker_sampling_params"',
'"$talker_sampling_params"',
'"$code2wav_sampling_params"'
]'
mm_processor_kwargs='{
"use_audio_in_video": true
}'
;;
multi_audios)
user_content='[
{
"type": "audio_url",
"audio_url": {
"url": "'"$MARY_HAD_LAMB_AUDIO_URL"'"
}
},
{
"type": "audio_url",
"audio_url": {
"url": "'"$WINNING_CALL_AUDIO_URL"'"
}
},
{
"type": "text",
"text": "Are these two audio clips the same?"
}
]'
sampling_params_list='[
'"$thinker_sampling_params"',
'"$talker_sampling_params"',
'"$code2wav_sampling_params"'
]'
mm_processor_kwargs="{}"
;;
esac
echo"Running query type: $QUERY_TYPE"
echo""
output=$(curl -sS-X POST http://localhost:8091/v1/chat/completions \
-H"Content-Type: application/json"\
-d @- <<EOF
{
"model": "Qwen/Qwen2.5-Omni-7B",
"sampling_params_list": $sampling_params_list,
"mm_processor_kwargs": $mm_processor_kwargs,
"modalities": $MODALITIES,
"messages": [
{
"role": "system",
"content": [
{
"type": "text",
"text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
}
]
},
{
"role": "user",
"content": $user_content
}
]
}
EOF
)
# Here it only shows the text content of the first choice. Audio content has many binaries, so it's not displayed here.
echo"Output of request: $(echo"$output" | jq '.choices[0].message.content')"