Commit c1cacde6 authored by weishb

vllm-omni_0.15.0.rc1+fix1 first commit

parent 35607782
#!/bin/bash
# vllm-omni customized version
# Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/bootstrap-amd.sh
# Last synced: 2025-12-15
# Modifications: Use local template file instead of downloading from ci-infra
set -euo pipefail
if [[ -z "${RUN_ALL:-}" ]]; then
RUN_ALL=0
fi
if [[ -z "${NIGHTLY:-}" ]]; then
NIGHTLY=0
fi
if [[ -z "${VLLM_CI_BRANCH:-}" ]]; then
VLLM_CI_BRANCH="main"
fi
if [[ -z "${AMD_MIRROR_HW:-}" ]]; then
AMD_MIRROR_HW="amdproduction"
fi
if [[ -z "${DOCS_ONLY_DISABLE:-}" ]]; then
DOCS_ONLY_DISABLE=0
fi
fail_fast() {
DISABLE_LABEL="ci-no-fail-fast"
# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
if [[ $PR_LABELS == *"$DISABLE_LABEL"* ]]; then
echo false
else
echo true
fi
else
echo false # not a PR or BUILDKITE_PULL_REQUEST not set
fi
}
check_run_all_label() {
RUN_ALL_LABEL="ready-run-all-tests"
# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
if [[ $PR_LABELS == *"$RUN_ALL_LABEL"* ]]; then
echo true
else
echo false
fi
else
echo false # not a PR or BUILDKITE_PULL_REQUEST not set
fi
}
if [[ -z "${COV_ENABLED:-}" ]]; then
COV_ENABLED=0
fi
upload_pipeline() {
echo "Uploading pipeline..."
ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI'
# Install minijinja
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh
source /var/lib/buildkite-agent/.cargo/env
if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then
AMD_MIRROR_HW="amdtentative"
fi
# Use local template file for vllm-omni
cp .buildkite/test-template-amd-omni.j2 .buildkite/test-template.j2
# (WIP) Use pipeline generator instead of jinja template
if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then
python -m pip install click pydantic
python .buildkite/pipeline_generator/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" --nightly="$NIGHTLY" --mirror_hw="$AMD_MIRROR_HW"
buildkite-agent pipeline upload .buildkite/pipeline.yaml
exit 0
fi
echo "List file diff: $LIST_FILE_DIFF"
echo "Run all: $RUN_ALL"
echo "Nightly: $NIGHTLY"
echo "AMD Mirror HW: $AMD_MIRROR_HW"
FAIL_FAST=$(fail_fast)
cd .buildkite
(
set -x
# Output pipeline.yaml with all blank lines removed
minijinja-cli test-template.j2 test-amd.yaml \
-D branch="$BUILDKITE_BRANCH" \
-D list_file_diff="$LIST_FILE_DIFF" \
-D run_all="$RUN_ALL" \
-D nightly="$NIGHTLY" \
-D mirror_hw="$AMD_MIRROR_HW" \
-D fail_fast="$FAIL_FAST" \
-D vllm_use_precompiled="$VLLM_USE_PRECOMPILED" \
-D vllm_merge_base_commit="$(git merge-base origin/main HEAD)" \
-D cov_enabled="$COV_ENABLED" \
-D vllm_ci_branch="$VLLM_CI_BRANCH" \
| sed '/^[[:space:]]*$/d' \
> pipeline.yaml
)
cat pipeline.yaml
buildkite-agent artifact upload pipeline.yaml
buildkite-agent pipeline upload pipeline.yaml
exit 0
}
get_diff() {
# Stage everything so new/untracked files show up in the diff
git add . > /dev/null
# echo flattens the newline-separated list into one space-separated line,
# which the callers below rely on when word-splitting.
echo $(git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD))
}
get_diff_main() {
git add . > /dev/null
echo $(git diff --name-only --diff-filter=ACMDR HEAD~1)
}
file_diff=$(get_diff)
if [[ $BUILDKITE_BRANCH == "main" ]]; then
file_diff=$(get_diff_main)
fi
# ----------------------------------------------------------------------
# Early exit start: skip pipeline if conditions are met
# ----------------------------------------------------------------------
# skip pipeline if all changed files are under docs/
if [[ "${DOCS_ONLY_DISABLE}" != "1" ]]; then
if [[ -n "${file_diff:-}" ]]; then
docs_only=1
# Robust iteration over newline-separated file_diff
while IFS= read -r f; do
[[ -z "$f" ]] && continue
# Policy: only skip if every changed path starts with docs/
if [[ "$f" != docs/* ]]; then
docs_only=0
break
fi
done < <(printf '%s\n' "$file_diff" | tr ' ' '\n' | tr -d '\r')
if [[ "$docs_only" -eq 1 ]]; then
buildkite-agent annotate ":memo: CI skipped — docs/** only changes detected
\`\`\`
${file_diff}
\`\`\`" --style "info" || true
echo "[docs-only] All changes are under docs/. Exiting before pipeline upload."
exit 0
fi
fi
fi
# ----------------------------------------------------------------------
# Early exit end
# ----------------------------------------------------------------------
patterns=(
"docker/Dockerfile"
"CMakeLists.txt"
"requirements/common.txt"
"requirements/cuda.txt"
"requirements/build.txt"
"requirements/test.txt"
"setup.py"
"csrc/"
"cmake/"
)
ignore_patterns=(
"docker/Dockerfile."
"csrc/cpu"
"csrc/rocm"
"cmake/hipify.py"
"cmake/cpu_extension.cmake"
)
for file in $file_diff; do
# First check if file matches any pattern
matches_pattern=0
for pattern in "${patterns[@]}"; do
if [[ $file == $pattern* ]] || [[ $file == $pattern ]]; then
matches_pattern=1
break
fi
done
# If file matches pattern, check it's not in ignore patterns
if [[ $matches_pattern -eq 1 ]]; then
matches_ignore=0
for ignore in "${ignore_patterns[@]}"; do
if [[ $file == $ignore* ]] || [[ $file == $ignore ]]; then
matches_ignore=1
break
fi
done
if [[ $matches_ignore -eq 0 ]]; then
RUN_ALL=1
echo "Found changes: $file. Run all tests"
break
fi
fi
done
# Check for ready-run-all-tests label
LABEL_RUN_ALL=$(check_run_all_label)
if [[ $LABEL_RUN_ALL == true ]]; then
RUN_ALL=1
NIGHTLY=1
echo "Found 'ready-run-all-tests' label. Running all tests including optional tests."
fi
# Decide whether to use precompiled wheels.
# Reuses the RUN_ALL decision derived from the patterns array above.
if [[ -n "${VLLM_USE_PRECOMPILED:-}" ]]; then
echo "VLLM_USE_PRECOMPILED is already set to: $VLLM_USE_PRECOMPILED"
elif [[ $RUN_ALL -eq 1 ]]; then
export VLLM_USE_PRECOMPILED=0
echo "Detected critical changes, building wheels from source"
else
export VLLM_USE_PRECOMPILED=1
echo "No critical changes, using precompiled wheels"
fi
LIST_FILE_DIFF=$(get_diff | tr ' ' '|')
if [[ $BUILDKITE_BRANCH == "main" ]]; then
LIST_FILE_DIFF=$(get_diff_main | tr ' ' '|')
fi
upload_pipeline
steps:
- label: ":docker: Build image"
key: image-build
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker build --file docker/Dockerfile.ci -t vllm-omni-ci ."
- "docker tag vllm-omni-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
agents:
queue: "cpu_queue_premerge"
# - label: "Test on NPU"
# depends_on: ~
# key: npu-test
# commands:
# - ".buildkite/scripts/hardware_ci/run_npu_test.sh"
# agents:
# queue: "ascend"
- label: "Simple Unit Test"
depends_on: image-build
commands:
- pytest -v -s tests/entrypoints/
- pytest -v -s tests/diffusion/cache/
- pytest -v -s tests/diffusion/lora/
- pytest -v -s tests/model_executor/models/qwen2_5_omni/test_audio_length.py
- pytest -v -s tests/worker/
- pytest -v -s tests/distributed/omni_connectors/test_kv_flow.py
agents:
queue: "gpu_1_queue"
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Model Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Images API LoRA E2E"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Model CPU offloading Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
- pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Audio Generation Model Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Cache Backend Test"
timeout_in_minutes: 15
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Sequence Parallelism Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
shm-size: "8gb"
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Tensor Parallelism Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
shm-size: "8gb"
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion GPU Worker Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/diffusion/test_diffusion_worker.py
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
shm-size: "8gb"
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Benchmark Test"
timeout_in_minutes: 15
depends_on: image-build
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/benchmarks/test_serve_cli.py
agents:
queue: "mithril-h100-pool"
plugins:
- kubernetes:
podSpec:
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
resources:
limits:
nvidia.com/gpu: 2
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: HF_HOME
value: /root/.cache/huggingface
nodeSelector:
node.kubernetes.io/instance-type: gpu-h100-sxm
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate
- label: "Omni Model Test"
timeout_in_minutes: 15
depends_on: image-build
commands:
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
# - label: "Omni Model Test with H100"
# timeout_in_minutes: 30
# depends_on: image-build
# commands:
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
# - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
# - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
# - pytest -s -v tests/e2e/online_serving/test_async_omni.py
# agents:
# queue: "mithril-h100-pool"
# plugins:
# - kubernetes:
# podSpec:
# containers:
# - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# resources:
# limits:
# nvidia.com/gpu: 2
# volumeMounts:
# - name: devshm
# mountPath: /dev/shm
# - name: hf-cache
# mountPath: /root/.cache/huggingface
# env:
# - name: HF_HOME
# value: /root/.cache/huggingface
# nodeSelector:
# node.kubernetes.io/instance-type: gpu-h100-sxm
# volumes:
# - name: devshm
# emptyDir:
# medium: Memory
# - name: hf-cache
# hostPath:
# path: /mnt/hf-cache
# type: DirectoryOrCreate
- label: "Diffusion Image Edit Test with H100 (1 GPU)"
timeout_in_minutes: 20
depends_on: image-build
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
agents:
queue: "mithril-h100-pool"
plugins:
- kubernetes:
podSpec:
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: HF_HOME
value: /root/.cache/huggingface
nodeSelector:
node.kubernetes.io/instance-type: gpu-h100-sxm
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate
# - label: "Bagel Text2Img Model Test with H100"
# timeout_in_minutes: 30
# depends_on: image-build
# commands:
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
# agents:
# queue: "mithril-h100-pool"
# plugins:
# - kubernetes:
# podSpec:
# containers:
# - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# resources:
# limits:
# nvidia.com/gpu: 1
# volumeMounts:
# - name: devshm
# mountPath: /dev/shm
# - name: hf-cache
# mountPath: /root/.cache/huggingface
# env:
# - name: HF_HOME
# value: /root/.cache/huggingface
# nodeSelector:
# node.kubernetes.io/instance-type: gpu-h100-sxm
# volumes:
# - name: devshm
# emptyDir:
# medium: Memory
# - name: hf-cache
# hostPath:
# path: /mnt/hf-cache
# type: DirectoryOrCreate
#!/bin/bash
# Helper function to safely login to ECR Public with per-job config isolation
# Uses DOCKER_CONFIG environment variable to prevent race conditions
#
# This script prevents the "device or resource busy" error by giving each
# Buildkite job its own isolated Docker config directory.
#
# Usage:
# source docker_login_ecr_public.sh && safe_docker_login_ecr_public
set -euo pipefail
# Configuration
ECR_REGISTRY="public.ecr.aws"
setup_isolated_docker_config() {
# Use BUILDKITE_JOB_ID for job-specific isolation
# Fallback to PID if running outside Buildkite
local job_id="${BUILDKITE_JOB_ID:-$$}"
# Set Docker config to job-specific directory
export DOCKER_CONFIG="/tmp/docker-config-${job_id}"
# Create directory if it doesn't exist
mkdir -p "$DOCKER_CONFIG"
echo "[docker-config] Using isolated Docker config: $DOCKER_CONFIG"
}
check_docker_auth() {
# Check if already authenticated to the given registry
# Returns 0 if authenticated, 1 if not
local registry="$1"
# Check if credentials exist in the isolated config
if [[ -f "$DOCKER_CONFIG/config.json" ]]; then
# Check if registry is present in config
if grep -q "$registry" "$DOCKER_CONFIG/config.json" 2>/dev/null; then
return 0
fi
fi
return 1
}
safe_docker_login_ecr_public() {
# Setup isolated config first
setup_isolated_docker_config
local registry="$ECR_REGISTRY"
# Check if already authenticated (within this job)
if check_docker_auth "$registry"; then
echo "[docker-login] Already authenticated to $registry in this job"
return 0
fi
# Perform login to isolated config directory
echo "[docker-login] Logging in to $ECR_REGISTRY (isolated config)..."
if aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$ECR_REGISTRY"; then
echo "[docker-login] Login successful (config: $DOCKER_CONFIG)"
return 0
else
local exit_code=$?
echo "[docker-login] ERROR: Login failed with exit code $exit_code" >&2
return $exit_code
fi
}
# Execute if run as script (not sourced)
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
safe_docker_login_ecr_public
fi
#!/bin/bash
# vllm-omni customized version
# Based on: vllm/.buildkite/scripts/hardware_ci/run-amd-test.sh
# Last synced: 2025-12-15
# Modifications: docker image name for vllm-omni
# This script runs tests inside the corresponding ROCm docker container.
set -o pipefail
# Export Python path
export PYTHONPATH=".."
# Wait until the GPUs report a clean state
echo "--- Confirming Clean Initial State"
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- ROCm info"
rocminfo
# cleanup older docker images
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}
# Call the cleanup docker function
cleanup_docker
echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container"
image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-rocm-omni"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Install AWS CLI to authenticate to ECR Public Gallery to get higher rate limit for pulling images
sudo apt-get update && sudo apt-get install -y awscli
# Use safe docker login helper to prevent race conditions
source "$(dirname "${BASH_SOURCE[0]}")/../docker_login_ecr_public.sh"
safe_docker_login_ecr_public
# Pull the container from ECR Public Gallery
docker pull "${image_name}"
remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT
echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
commands=$@
echo "Commands:$commands"
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."
# Test that we're launching on the machine that has
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1
fi
# If the command contains a shard flag, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e MIOPEN_DEBUG_CONV_DIRECT=0 \
-e MIOPEN_DEBUG_CONV_GEMM=0 \
-e VLLM_ROCM_USE_AITER=1 \
-e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}_${GPU}" \
"${image_name}" \
/bin/bash -c "${commands_gpu}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
# Wait for all processes to finish and collect exit codes
for pid in "${PIDS[@]}"; do
wait "${pid}"
STATUS+=($?)
done
for st in "${STATUS[@]}"; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
exit "${st}"
fi
done
else
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e MIOPEN_DEBUG_CONV_DIRECT=0 \
-e MIOPEN_DEBUG_CONV_GEMM=0 \
-e VLLM_ROCM_USE_AITER=1 \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
fi
#!/bin/bash
# This script builds the Ascend NPU docker image and runs offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# Base ubuntu image with basic ascend development libraries and python installed
VLLM_OMNI_REPO="https://github.com/vllm-project/vllm-omni.git"
BASE_IMAGE_NAME="quay.nju.edu.cn/ascend/vllm-ascend:v0.11.0rc2"
image_name="npu/vllm-omni-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
# image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir}
# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host pypi-cache:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--build-arg BUILDKITE_PULL_REQUEST="${BUILDKITE_PULL_REQUEST}" \
--build-arg BUILDKITE_PULL_REQUEST_REPO="${BUILDKITE_PULL_REQUEST_REPO}" \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
RUN pip config set global.index-url http://pypi-cache:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host pypi-cache && \
apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install for pytest to make the docker build cache layer always valid
RUN --mount=type=cache,target=/root/.cache/pip \
pip install pytest>=6.0 pytest-cov modelscope
COPY . .
# Install vllm-omni
WORKDIR /workspace
ARG VLLM_OMNI_REPO=https://github.com/vllm-project/vllm-omni.git
ARG VLLM_OMNI_TAG=main
ARG BUILDKITE_PULL_REQUEST
ARG BUILDKITE_PULL_REQUEST_REPO
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
if [ "\$BUILDKITE_PULL_REQUEST" != "false" ] && [ -n "\$BUILDKITE_PULL_REQUEST" ]; then \
echo "Cloning and checking out PR #\$BUILDKITE_PULL_REQUEST..." && \
git clone \$VLLM_OMNI_REPO /workspace/vllm-omni && \
cd /workspace/vllm-omni && \
git fetch origin pull/\$BUILDKITE_PULL_REQUEST/head:pr-\$BUILDKITE_PULL_REQUEST && \
git checkout pr-\$BUILDKITE_PULL_REQUEST; \
else \
echo "Not a PR build, using main branch" && \
git clone --depth 1 \$VLLM_OMNI_REPO /workspace/vllm-omni; \
fi
RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-omni/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
WORKDIR /workspace/vllm-omni
CMD ["/bin/bash"]
EOF
# Setup cleanup
remove_docker_container() {
docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
local input="$1"
local index cards_num
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
index="${BASH_REMATCH[1]}"
cards_num="${BASH_REMATCH[2]}"
else
echo "parse error" >&2
return 1
fi
local devices=""
local i=0
while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i ))
devices="$devices --device /dev/davinci${dev_idx}"
((i++))
done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output the device args so the caller can capture them
printf '%s' "$devices"
}
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
hf_model_cache_dir=/mnt/hf_cache${agent_idx}
ms_model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${hf_model_cache_dir}
mkdir -p ${ms_model_cache_dir}
docker run \
--init \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v ${hf_model_cache_dir}:/root/.cache/huggingface \
-v ${ms_model_cache_dir}:/root/.cache/modelscope \
--network host \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \
bash -c '
set -e
VLLM_USE_MODELSCOPE=True pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
'
steps:
- label: "Diffusion Model Test"
timeout_in_minutes: 20
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
- label: "Diffusion Images API LoRA E2E"
timeout_in_minutes: 20
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
- label: "Diffusion Model CPU offloading Test"
timeout_in_minutes: 20
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
- label: "Diffusion Cache Backend Test"
timeout_in_minutes: 15
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
- label: "Diffusion Sequence Parallelism Test"
timeout_in_minutes: 20
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
- label: "Diffusion Tensor Parallelism Test"
timeout_in_minutes: 20
agent_pool: mi325_2
depends_on: amd-build
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
- label: "Diffusion GPU Worker Test"
timeout_in_minutes: 20
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- pytest -s -v tests/diffusion/test_diffusion_worker.py
- label: "Omni Model Test Qwen2-5-Omni"
timeout_in_minutes: 15
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
- label: "Omni Model Test Qwen3-Omni"
timeout_in_minutes: 15
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- export VLLM_TEST_CLEAN_GPU_MEMORY="1"
- pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
- pytest -s -v tests/e2e/online_serving/test_async_omni.py
- label: "Diffusion Image Edit Test"
timeout_in_minutes: 15
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
{# vllm-omni customized version
Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/test-template-amd.j2
Last synced: 2025-12-15
Modifications: Removed unused CUDA/NVIDIA logic, keeping only AMD tests
#}
{% set docker_image_amd = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-rocm-omni" %}
{% set default_working_dir = "/app/vllm-omni" %}
- group: "AMD Tests"
depends_on: ~
steps:
- label: "AMD: :docker: build image"
depends_on: ~
soft_fail: false
commands:
- "source .buildkite/scripts/docker_login_ecr_public.sh && safe_docker_login_ecr_public"
- "docker build -f docker/Dockerfile.rocm -t {{ docker_image_amd }} --target final --progress plain ."
- "docker push {{ docker_image_amd }}"
key: "amd-build"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 1 # Machine occasionally fail
limit: 1
agents:
queue: cpu_queue_premerge
{% for step in steps %}
{% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %}
- label: "{{ step.agent_pool }}: {{ step.label }}"
depends_on: amd-build
agents:
{% if step.agent_pool %}
queue: amd_{{ step.agent_pool }}
{% else %}
queue: amd_mi325_1
{% endif %}
command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
priority: 100
{% if step.grade and step.grade == "Blocking" %}
soft_fail: false
{% else %}
soft_fail: true
{% endif %}
{% endif %}
{% endfor %}
default_install_hook_types:
- pre-commit
- commit-msg
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
hooks:
# list of supported hooks: https://pre-commit.com/hooks.html
- id: check-yaml
args: ["--unsafe"]
- id: debug-statements
- id: end-of-file-fixer
- id: mixed-line-ending
args: ["--fix=lf"]
- id: trailing-whitespace
args: ["--markdown-linebreak-ext=md"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.10
hooks:
- id: ruff-check
args: [--output-format, github, --fix]
- id: ruff-format
- repo: https://github.com/crate-ci/typos
rev: typos-dict-v0.13.13
hooks:
- id: typos
# only for staged files
- repo: https://github.com/rhysd/actionlint
# v1.7.8+ sets `go 1.24.0` in go.mod, which older Go toolchains (and most
# current CI images) cannot parse. Pin to v1.7.7 until actionlint fixes the
# go.mod directive.
rev: v1.7.7
hooks:
- id: actionlint
files: ^\.github/workflows/.*\.ya?ml$
- repo: local
hooks:
- id: signoff-commit
name: Sign-off Commit
entry: bash
args:
- -c
- |
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
fi
language: system
verbose: true
stages: [commit-msg]
# Keep `suggestion` last
- id: suggestion
name: Suggestion
entry: bash -c 'echo "To bypass all the pre-commit hooks, add --no-verify to git commit. To skip a specific hook, prefix the commit command with SKIP=<hook-id>."'
language: system
verbose: true
pass_filenames: false
# Insert new entries above the `suggestion` entry
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/pre_commit/check_pickle_imports.py
language: python
types: [python]
additional_dependencies: [regex]
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.12"
jobs:
post_checkout:
- git fetch --unshallow || true
mkdocs:
configuration: mkdocs.yml
fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:
install:
- method: pip
path: .
extra_requirements:
- docs
# Contributing to vLLM-Omni
You can find information about contributing to vLLM-Omni in the [Contributing guide](https://vllm-omni.readthedocs.io/en/latest/contributing/).
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# vllm-omni
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/docs/source/logos/vllm-omni-logo.png">
<img alt="vllm-omni" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/docs/source/logos/vllm-omni-logo.png" width=55%>
</picture>
</p>
<h3 align="center">
Easy, fast, and cheap omni-modality model serving for everyone
</h3>
vLLM was originally designed to support large language models for text generation tasks. vLLM-Omni is a framework that extends vLLM's support to omni-modality model inference and serving.
<p align="center">
| <a href="https://vllm-omni.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> | <a href="docs/assets/WeChat.jpg"><b>WeChat</b></a> |
</p>
---
*Latest News* 🔥
- [2026/02] We released [0.14.0](https://github.com/vllm-project/vllm-omni/releases/tag/v0.14.0) - This is the first **stable release** of vLLM-Omni that expands Omni’s diffusion / image-video generation and audio / TTS stack, improves distributed execution and memory efficiency, and broadens platform/backend coverage (GPU/ROCm/NPU/XPU). It also brings meaningful upgrades to serving APIs, profiling & benchmarking, and overall stability. Please check our latest [paper](https://arxiv.org/abs/2602.02204) for architecture design and performance results.
- [2026/01] We released [0.12.0rc1](https://github.com/vllm-project/vllm-omni/releases/tag/v0.12.0rc1) - a major RC milestone focused on maturing the diffusion stack, strengthening OpenAI-compatible serving, expanding omni-model coverage, and improving stability across platforms (GPU/NPU/ROCm), please check our latest [design](https://docs.google.com/presentation/d/1qv4qMW1rKAqDREMXiUDLIgqqHQe7TDPj/edit?usp=sharing&ouid=110473603432222024453&rtpof=true&sd=true).
- [2025/11] The vLLM community officially released [vllm-project/vllm-omni](https://github.com/vllm-project/vllm-omni) to support omni-modality model serving.
---
## About
[vLLM](https://github.com/vllm-project/vllm) was originally designed to support large language models for text-based autoregressive generation tasks. vLLM-Omni is a framework that extends its support for omni-modality model inference and serving:
- **Omni-modality**: Text, image, video, and audio data processing
- **Non-autoregressive Architectures**: extend the AR support of vLLM to Diffusion Transformers (DiT) and other parallel generation models
- **Heterogeneous outputs**: from traditional text generation to multimodal outputs
<p align="center">
<picture>
<img alt="vllm-omni" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/docs/source/architecture/omni-modality-model-architecture.png" width=55%>
</picture>
</p>
vLLM-Omni is fast with:
- State-of-the-art AR support by leveraging efficient KV cache management from vLLM
- Pipelined stage execution overlapping for high throughput performance
- Full disaggregation based on OmniConnector and dynamic resource allocation across stages
vLLM-Omni is flexible and easy to use with:
- Heterogeneous pipeline abstraction to manage complex model workflows
- Seamless integration with popular Hugging Face models
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
vLLM-Omni seamlessly supports most popular open-source models on HuggingFace, including:
- Omni-modality models (e.g. Qwen-Omni)
- Multi-modality generation models (e.g. Qwen-Image)
## Getting Started
Visit our [documentation](https://vllm-omni.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm-omni.readthedocs.io/en/latest/getting_started/installation/)
- [Quickstart](https://vllm-omni.readthedocs.io/en/latest/getting_started/quickstart/)
- [List of Supported Models](https://vllm-omni.readthedocs.io/en/latest/models/supported_models/)
## Contributing
We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM-Omni](https://vllm-omni.readthedocs.io/en/latest/contributing/) for how to get involved.
## Citation
If you use vLLM-Omni for your research, please cite our [paper](https://arxiv.org/abs/2602.02204):
```bibtex
@article{yin2026vllmomni,
title={vLLM-Omni: Fully Disaggregated Serving for Any-to-Any Multimodal Models},
author={Peiqi Yin and Jiangyun Zhu and Han Gao and Chenguang Zheng and Yongxiang Huang and Taichang Zhou and Ruirui Yang and Weizhi Liu and Weiqing Chen and Canlin Guo and Didan Deng and Zifeng Mo and Cong Wang and James Cheng and Roger Wang and Hongsheng Liu},
journal={arXiv preprint arXiv:2602.02204},
year={2026}
}
```
## Join the Community
Feel free to ask questions, provide feedback, and discuss with fellow users of vLLM-Omni in the `#sig-omni` Slack channel at [slack.vllm.ai](https://slack.vllm.ai) or the vLLM user forum at [discuss.vllm.ai](https://discuss.vllm.ai).
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=vllm-project/vllm-omni&type=date&legend=top-left)](https://www.star-history.com/#vllm-project/vllm-omni&type=date&legend=top-left)
## License
Apache License 2.0, as found in the [LICENSE](./LICENSE) file.
# Benchmarks Overview and Architecture
This document explains the benchmark architecture across all benchmark assets in this repo. It describes what we measure, and where to find or plug in new scenarios. Per-task details remain in subfolder READMEs (e.g., `benchmarks/<model>/README.md`).
## Scope and goals
- Establish repeatable latency/throughput measurements for multimodal LLM pipelines.
- Provide both HF Transformers (offline) and vLLM-Omni (multi-stage/pipeline) baselines.
- Make it easy to plug in new datasets and models with minimal changes to the runner scripts.
## Dataset and inputs
- Default example: SeedTTS top-100 prompts (`benchmarks/build_dataset/top100.txt`) via `benchmarks/build_dataset/`.
- Extensible: drop in new prompt files or modality-aligned payloads; keep the expected format for the consuming scripts (e.g., one prompt per line).
- If you add a new dataset, document it under `benchmarks/<model>/README.md` and point scripts to your data path.
## Directory layout
- `benchmarks/build_dataset/` — dataset prep utilities (e.g., SeedTTS top100).
- `benchmarks/<model>/vllm_omni/` — vLLM-Omni pipeline benchmarks, logs, outputs.
- Add new tasks under `benchmarks/<model>/...` with the same pattern: `transformers/`, `vllm_omni/`, task-specific README, and (optionally) dataset prep notes.
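For example, a new task might start from a skeleton like the one below (the names are illustrative only; the repo does not mandate these exact paths):
```bash
# Hypothetical skeleton for a new benchmark task; adjust names to your model.
mkdir -p benchmarks/my_model/transformers benchmarks/my_model/vllm_omni
touch benchmarks/my_model/README.md                  # dataset, configs, expected outputs
touch benchmarks/my_model/transformers/run_hf.sh     # HF Transformers offline baseline
touch benchmarks/my_model/vllm_omni/run_pipeline.sh  # vLLM-Omni pipeline benchmark
```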
## Reference workflows
- **HF Transformers (offline, single process)**
Script (example): `benchmarks/<model>/transformers/eval_qwen3_moe_omni_transformers.sh`
Outputs: `benchmark_results/perf_stats.json`, `benchmark_results/results.json`, `benchmark_results/audio/` (if audio is produced).
- **vLLM-Omni end-to-end pipeline**
Script (example): `benchmarks/<model>/vllm_omni/eval_qwen3_moe_omni.sh`
Outputs: `vllm_omni/logs/*.stats.jsonl` (per-stage/overall latency & TPS), `vllm_omni/logs/stage*.log`, `vllm_omni/outputs/` (text/audio artifacts).
- **Adding a new task/model**
1) Create `benchmarks/<model>/transformers/` and/or `benchmarks/<model>/vllm_omni/` with scripts referencing your model and dataset.
2) Add a task README describing dataset, configs, and expected outputs.
3) Keep the output/log structure similar for easy comparison (perf_stats/results/audio or text outputs; stats.jsonl/logs for pipeline).
## Metrics to watch
- **Throughput**: `overall_tps`, `*_tps_avg` per stage.
- **Latency distribution**: look for long tails in `*.stats.jsonl`.
- **Quality/completeness**: missing outputs or errors in stage logs indicate pipeline failures or misconfigurations.
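As a quick sanity check, you can pull throughput fields out of the stats files with `jq`. This is only a sketch: the exact key and file names depend on what your task actually emits, so adjust them to match your `*.stats.jsonl`.
```bash
# Print every overall_tps value found in the pipeline stats files and show the
# highest one. Assumes records carry an "overall_tps" field as described above.
jq -r 'select(.overall_tps != null) | .overall_tps' vllm_omni/logs/*.stats.jsonl \
  | sort -n | tail -n 1
```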
## Troubleshooting
- Verify GPU/driver/FlashAttention2 requirements for your chosen model/config.
- Ensure network access for dataset/model downloads (Google Drive, Hugging Face, etc.).
- If outputs are missing or slow, inspect per-stage logs and `*.stats.jsonl` for errors, stragglers, or contention.
# Benchmark Dataset Preparation Guide
This guide describes how to download and prepare the SeedTTS test dataset for benchmarking Qwen-Omni models.
## Prerequisites
- Python 3.8+
- `gdown` for downloading from Google Drive
- Access to the benchmark scripts
## Steps
### 1. Navigate to the Dataset Directory
```bash
cd benchmarks/build_dataset
```
### 2. Install Dependencies
```bash
pip install gdown
```
### 3. Download the SeedTTS Test Dataset
Download the dataset from Google Drive:
```bash
gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP
```
### 4. Extract the Dataset
```bash
tar -xf seedtts_testset.tar
```
### 5. Prepare the Metadata File
Copy the English metadata file to the working directory:
```bash
cp seedtts_testset/en/meta.lst meta.lst
```
### 6. Extract Prompts
Extract the first N prompts from the metadata file:
```bash
# Extract top 100 prompts (adjust -n for different amounts)
python extract_tts_prompts.py -i meta.lst -o top100.txt -n 100
```
**Options:**
- `-i, --input`: Input metadata file (default: `meta.lst`)
- `-o, --output`: Output prompts file (default: `prompts.txt`)
- `-n, --num_lines`: Number of prompts to extract (required)
### 7. Clean Up (Optional)
Remove temporary files to save disk space:
```bash
rm -rf seedtts_testset
rm seedtts_testset.tar
rm meta.lst
```
## Quick Start (All-in-One)
```bash
# Full setup and benchmark
cd benchmarks/build_dataset
pip install gdown
gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP
tar -xf seedtts_testset.tar
cp seedtts_testset/en/meta.lst meta.lst
python extract_tts_prompts.py -i meta.lst -o top100.txt -n 100
rm -rf seedtts_testset seedtts_testset.tar meta.lst
```
#!/usr/bin/env python3
"""
Extract prompts from meta.lst and save them to a txt file.
Each line in meta.lst has the format:
ID|prompt_text|audio_path|target_text
This script extracts the prompt_text (second field) from the first N lines.
"""
import argparse
from pathlib import Path
def extract_prompts(input_file: str, output_file: str, num_lines: int) -> None:
"""
Extract prompts from meta.lst and save to output file.
Args:
input_file: Path to the meta.lst file
output_file: Path to the output txt file
num_lines: Number of lines to process
"""
prompts = []
with open(input_file, encoding="utf-8") as f:
for i, line in enumerate(f):
if i >= num_lines:
break
line = line.strip()
if not line: # Skip empty lines
continue
parts = line.split("|")
if len(parts) >= 2:
prompt = parts[1] # The prompt is the second field
prompts.append(prompt)
# Write prompts to output file
with open(output_file, "w", encoding="utf-8") as f:
for prompt in prompts:
f.write(prompt + "\n")
# Print result stats
print(f"Extracted {len(prompts)} prompts from first {num_lines} lines")
print(f"Saved to: {output_file}")
def main():
parser = argparse.ArgumentParser(description="Extract prompts from meta.lst file")
parser.add_argument(
"-i", "--input", type=str, default="meta.lst", help="Input meta.lst file path (default: meta.lst)"
)
parser.add_argument(
"-o", "--output", type=str, default="prompts.txt", help="Output txt file path (default: prompts.txt)"
)
parser.add_argument(
"-n", "--num_lines", type=int, required=True, help="Number of lines to extract from the beginning"
)
args = parser.parse_args()
# Check if input file exists
if not Path(args.input).exists():
print(f"Error: Input file '{args.input}' not found")
return
extract_prompts(args.input, args.output, args.num_lines)
if __name__ == "__main__":
main()
# Diffusion Serving Benchmark (Image/Video)
This folder contains an online-serving benchmark script for diffusion models.
It sends requests to a vLLM OpenAI-compatible endpoint and reports throughput,
latency percentiles, and optional SLO attainment.
The main entrypoint is:
- `benchmarks/diffusion/diffusion_benchmark_serving.py`
## 1. Quick Start
1. Start the server:
```bash
vllm serve Qwen/Qwen-Image --omni --port 8099
```
2. Run a minimal benchmark:
```bash
python3 benchmarks/diffusion/diffusion_benchmark_serving.py \
--base-url http://localhost:8099 \
--model Qwen/Qwen-Image \
--task t2i \
--dataset vbench \
--num-prompts 5
```
**Notes**
- The benchmark talks to `http://<host>:<port>/v1/chat/completions`.
- If you run the server on another host or port, pass `--base-url` accordingly.
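For reference, the sketch below mirrors the shape of a single request the benchmark script constructs (the prompt goes into the chat `messages`, and generation parameters such as width/height/steps travel in `extra_body`); the model name, URL, and parameter values here are illustrative only.
```python
import requests

# Illustrative single request mirroring the benchmark's payload construction;
# width/height/steps values are arbitrary examples, not defaults.
payload = {
    "model": "Qwen/Qwen-Image",
    "messages": [{"role": "user", "content": "A cat sitting on a bench"}],
    "extra_body": {"width": 1024, "height": 1024, "num_inference_steps": 50},
}
resp = requests.post("http://localhost:8099/v1/chat/completions", json=payload)
print(resp.status_code)
```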
## 2. Supported Datasets
The benchmark supports three dataset modes via `--dataset`:
- `vbench`: Built-in prompt/data loader.
- `trace`: Heterogeneous request traces (each request can have different resolution/frames/steps).
- `random`: Synthetic prompts for quick smoke tests.
### VBench dataset
If you use the i2v/i2i VBench datasets and rely on auto-download, you may need to install `gdown` first:
```bash
uv pip install gdown
```
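Prompts and any auto-downloaded VBench assets are cached under `~/.cache/vllm-omni` (see the dataset loaders in the benchmark script).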
### Trace dataset
Use `--dataset trace` to replay a trace file. The trace can specify per-request fields such as:
- `width`, `height`
- `num_frames` (video)
- `num_inference_steps`
- `seed`, `fps`
- optional `slo_ms` (per-request SLO target)
By default (when `--dataset-path` is not provided), the script downloads a default trace from
the HuggingFace dataset repo `asukaqaqzz/Dit_Trace`. The default filename can depend on `--task`
(e.g., `t2v` uses a video trace).
Current defaults:
- `--task t2i` -> `sd3_trace.txt`
- `--task t2v` -> `cogvideox_trace.txt`
You can point to your own trace using `--dataset-path`.
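Trace files are plain text; each non-empty line is a `Request(...)` entry whose keyword arguments are Python literals (see `_parse_trace_file` in the script). An illustrative line (values are made up):
```
Request(prompt='a red sports car on a coastal road', width=1024, height=1024, num_inference_steps=28, seed=0, slo_ms=20000.0)
```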
## 3. Benchmark Parameters
### Basic flags
- `--base-url`: Server address (the script calls `.../v1/chat/completions`).
- `--model`: The OpenAI-compatible `model` field.
- `--task`: Task type (e.g., `t2i`, `t2v`, `i2i`, `i2v`).
- `--dataset`: Dataset mode (`vbench` / `trace` / `random`).
- `--num-prompts`: Number of requests to send.
Common optional flags:
- `--output-file`: Write metrics to a JSON file.
- `--disable-tqdm`: Disable the progress bar.
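With `--output-file metrics.json`, the final metrics dictionary (duration, completed/failed request counts, `throughput_qps`, latency mean/median/p50/p99, peak-memory stats, and the SLO fields when `--slo` is enabled) is written as JSON.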
### Resolution / frames / steps: CLI defaults vs dataset fields
Related flags: `--width`, `--height`, `--num-frames`, `--fps`, `--num-inference-steps`.
- For `vbench` / `random`: these CLI flags act as global defaults for all generated requests.
- For `trace`: each request can carry its own fields (e.g., `width/height/num_frames/num_inference_steps`).
Precedence rules for `trace` (i.e., what actually gets sent):
- `width/height`: if either `--width` or `--height` is explicitly set, it overrides per-request values from the trace; otherwise per-request values are used when present.
- `num_frames`: per-request `num_frames` takes precedence; otherwise fall back to `--num-frames`.
- `num_inference_steps`: per-request `num_inference_steps` takes precedence; otherwise fall back to `--num-inference-steps`.
### SLO, warmup, and max concurrency
Enable SLO evaluation with `--slo`.
- If a request in the trace already has `slo_ms`, that value is used.
- Otherwise, the script runs warmup requests to infer a base unit time, estimates `expected_ms` by linearly scaling with area/frames/steps, and then sets `slo_ms = expected_ms * --slo-scale`.
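As a rough illustration (numbers are made up): if a 512x512, single-frame, single-step warmup takes 1.0 s, the inferred base unit is 1000 ms / (512*512 / 16*16) ≈ 0.98 ms; a 512x512, 20-step request is then expected to take about 0.98 ms * 1024 * 20 ≈ 20 s, so with the default `--slo-scale 3` its `slo_ms` is set to roughly 60,000 ms.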
Warmup flags:
- `--warmup-requests`: Number of warmup requests.
- `--warmup-num-inference-steps`: Steps used during warmup.
- For `--task t2v`: warmup requests are forced to use `num_frames=1` to make warmup faster and less noisy.
Traffic / concurrency flags:
- `--request-rate`: Target request rate (requests/second). If set to `inf`, the script sends all requests immediately.
- `--max-concurrency`: Max number of in-flight requests (default: `1`). This can hard-cap the achieved QPS: if it is too small, requests will queue behind the semaphore, and both achieved throughput and observed SLO attainment can be skewed.
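For example, `--request-rate 0.5 --max-concurrency 4` issues a new request every 2 seconds while allowing at most four requests in flight; with the default `--max-concurrency 1`, requests are effectively serialized regardless of `--request-rate`.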
# adapted from sglang and fastvideo
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark online serving for diffusion models (Image/Video Generation).
If you want to use the i2v or i2i datasets, run `uv pip install gdown` first.
Usage:
# Video
t2v:
python3 benchmarks/diffusion/diffusion_benchmark_serving.py \
--dataset vbench --task t2v --num-prompts 10 \
--height 480 --width 640 --fps 16 --num-frames 80
i2v:
python3 benchmarks/diffusion/diffusion_benchmark_serving.py \
--dataset vbench --task i2v --num-prompts 10
# Image
t2i:
python3 benchmarks/diffusion/diffusion_benchmark_serving.py \
--dataset vbench --task t2i --num-prompts 10 \
--height 1024 --width 1024
i2i:
python3 benchmarks/diffusion/diffusion_benchmark_serving.py \
--dataset vbench --task i2i --num-prompts 10
"""
import argparse
import ast
import asyncio
import base64
import glob
import json
import mimetypes
import os
import time
import uuid
from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator
from dataclasses import dataclass, field, replace
from typing import Any
import aiohttp
import numpy as np
import requests
from tqdm.asyncio import tqdm
@dataclass
class RequestFuncInput:
prompt: str
api_url: str
model: str
width: int | None = None
height: int | None = None
num_frames: int | None = None
num_inference_steps: int | None = None
seed: int | None = None
fps: int | None = None
timestamp: float | None = None
slo_ms: float | None = None
extra_body: dict[str, Any] = field(default_factory=dict)
image_paths: list[str] | None = None
request_id: str = field(default_factory=lambda: str(uuid.uuid4()))
@dataclass
class RequestFuncOutput:
success: bool = False
latency: float = 0.0
error: str = ""
start_time: float = 0.0
response_body: dict[str, Any] = field(default_factory=dict)
peak_memory_mb: float = 0.0
slo_achieved: bool | None = None
class BaseDataset(ABC):
def __init__(self, args, api_url: str, model: str):
self.args = args
self.api_url = api_url
self.model = model
@abstractmethod
def __len__(self) -> int:
pass
@abstractmethod
def __getitem__(self, idx: int) -> RequestFuncInput:
pass
@abstractmethod
def get_requests(self) -> list[RequestFuncInput]:
pass
class VBenchDataset(BaseDataset):
"""
Dataset loader for VBench prompts.
Supports t2v, i2v.
"""
T2V_PROMPT_URL = (
"https://raw.githubusercontent.com/Vchitect/VBench/master/prompts/prompts_per_dimension/subject_consistency.txt"
)
I2V_DOWNLOAD_SCRIPT_URL = (
"https://raw.githubusercontent.com/Vchitect/VBench/master/vbench2_beta_i2v/download_data.sh"
)
def __init__(self, args, api_url: str, model: str):
super().__init__(args, api_url, model)
self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "vllm-omni")
self.items = self._load_data()
def _load_data(self) -> list[dict[str, Any]]:
if self.args.task == "t2v":
return self._load_t2v_prompts()
elif self.args.task in ["i2v", "ti2v", "ti2i", "i2i"]:
return self._load_i2v_data()
else:
return self._load_t2v_prompts()
def _download_file(self, url: str, dest_path: str) -> None:
"""Download a file from URL to destination path."""
os.makedirs(os.path.dirname(dest_path), exist_ok=True)
resp = requests.get(url)
resp.raise_for_status()
with open(dest_path, "w") as f:
f.write(resp.text)
def _load_t2v_prompts(self) -> list[dict[str, Any]]:
path = self.args.dataset_path
if not path:
path = os.path.join(self.cache_dir, "vbench_subject_consistency.txt")
if not os.path.exists(path):
print(f"Downloading VBench T2V prompts to {path}...")
try:
self._download_file(self.T2V_PROMPT_URL, path)
except Exception as e:
print(f"Failed to download VBench prompts: {e}")
return [{"prompt": "A cat sitting on a bench"}] * 50
prompts = []
with open(path) as f:
for line in f:
line = line.strip()
if line:
prompts.append({"prompt": line})
return self._resize_data(prompts)
def _auto_download_i2v_dataset(self) -> str:
"""Auto-download VBench I2V dataset and return the dataset directory."""
vbench_i2v_dir = os.path.join(self.cache_dir, "vbench_i2v", "vbench2_beta_i2v")
info_json_path = os.path.join(vbench_i2v_dir, "data", "i2v-bench-info.json")
if os.path.exists(info_json_path):
return vbench_i2v_dir
print(f"Downloading VBench I2V dataset to {vbench_i2v_dir}...")
try:
cache_root = os.path.join(self.cache_dir, "vbench_i2v")
script_path = os.path.join(cache_root, "download_data.sh")
self._download_file(self.I2V_DOWNLOAD_SCRIPT_URL, script_path)
os.chmod(script_path, 0o755)
print("Executing download_data.sh (this may take a while)...")
import subprocess
result = subprocess.run(
["bash", script_path],
cwd=cache_root,
capture_output=True,
text=True,
)
if result.returncode != 0:
raise RuntimeError(f"Download script failed: {result.stderr}")
print(f"Successfully downloaded VBench I2V dataset to {vbench_i2v_dir}")
except Exception as e:
print(f"Failed to download VBench I2V dataset: {e}")
print("Please manually download following instructions at:")
print("https://github.com/Vchitect/VBench/tree/master/vbench2_beta_i2v#22-download")
return None
return vbench_i2v_dir if os.path.exists(info_json_path) else None
def _load_from_i2v_json(self, json_path: str) -> list[dict[str, Any]]:
"""Load I2V data from i2v-bench-info.json format."""
with open(json_path) as f:
items = json.load(f)
base_dir = os.path.dirname(os.path.dirname(json_path)) # Go up to vbench2_beta_i2v
origin_dir = os.path.join(base_dir, "data", "origin")
data = []
for item in items:
img_path = os.path.join(origin_dir, item.get("file_name", ""))
if os.path.exists(img_path):
data.append({"prompt": item.get("caption", ""), "image_path": img_path})
else:
print(f"Warning: Image not found: {img_path}")
print(f"Loaded {len(data)} I2V samples from VBench I2V dataset")
return data
def _scan_directory_for_images(self, path: str) -> list[dict[str, Any]]:
"""Scan directory for image files."""
exts = ["*.jpg", "*.jpeg", "*.png", "*.webp"]
files = []
for ext in exts:
files.extend(glob.glob(os.path.join(path, ext)))
files.extend(glob.glob(os.path.join(path, ext.upper())))
# Also check in data/origin subdirectory
origin_dir = os.path.join(path, "data", "origin")
if os.path.exists(origin_dir):
files.extend(glob.glob(os.path.join(origin_dir, ext)))
files.extend(glob.glob(os.path.join(origin_dir, ext.upper())))
return [{"prompt": os.path.splitext(os.path.basename(f))[0], "image_path": f} for f in files]
def _create_dummy_data(self) -> list[dict[str, Any]]:
"""Create dummy data with a placeholder image in cache directory."""
print("No I2V data found. Using dummy placeholders.")
dummy_image = os.path.join(self.cache_dir, "dummy_image.jpg")
if not os.path.exists(dummy_image):
try:
from PIL import Image
os.makedirs(self.cache_dir, exist_ok=True)
img = Image.new("RGB", (100, 100), color="red")
img.save(dummy_image)
print(f"Created dummy image at {dummy_image}")
except ImportError:
print("PIL not installed, cannot create dummy image.")
return []
return [{"prompt": "A moving cat", "image_path": dummy_image}] * 10
def _load_i2v_data(self) -> list[dict[str, Any]]:
"""Load I2V data from VBench I2V dataset or user-provided path."""
path = self.args.dataset_path
# Auto-download if no path provided
if not path:
path = self._auto_download_i2v_dataset()
if not path:
return self._resize_data(self._create_dummy_data())
# Try to load from i2v-bench-info.json
info_json_candidates = [
os.path.join(path, "data", "i2v-bench-info.json"),
path if path.endswith(".json") else None,
]
for json_path in info_json_candidates:
if json_path and os.path.exists(json_path):
try:
return self._resize_data(self._load_from_i2v_json(json_path))
except Exception as e:
print(f"Failed to load {json_path}: {e}")
# Fallback: scan directory for images
if os.path.isdir(path):
data = self._scan_directory_for_images(path)
if data:
return self._resize_data(data)
# Last resort: dummy data
return self._resize_data(self._create_dummy_data())
def _resize_data(self, data: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""Resize data to match num_prompts."""
if not self.args.num_prompts:
return data
if len(data) < self.args.num_prompts:
factor = (self.args.num_prompts // len(data)) + 1
data = data * factor
return data[: self.args.num_prompts]
def __len__(self) -> int:
return len(self.items)
def __getitem__(self, idx: int) -> RequestFuncInput:
item = self.items[idx]
image_paths = [item["image_path"]] if "image_path" in item else None
return RequestFuncInput(
prompt=item.get("prompt", ""),
api_url=self.api_url,
model=self.model,
width=self.args.width,
height=self.args.height,
num_frames=self.args.num_frames,
num_inference_steps=self.args.num_inference_steps,
seed=self.args.seed,
fps=self.args.fps,
image_paths=image_paths,
)
def get_requests(self) -> list[RequestFuncInput]:
return [self[i] for i in range(len(self))]
class TraceDataset(BaseDataset):
"""Trace-based dataset loader for heterogeneous diffusion requests."""
DEFAULT_REPO_ID = "asukaqaqzz/Dit_Trace"
DEFAULT_FILENAME = "sd3_trace.txt"
DEFAULT_FILENAME_BY_TASK: dict[str, str] = {
# Text-to-image traces (e.g., SD3)
"t2i": "sd3_trace.txt",
# Text-to-video traces (e.g., CogVideoX)
"t2v": "cogvideox_trace.txt",
}
def __init__(self, args, api_url: str, model: str):
super().__init__(args, api_url, model)
self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "vllm-omni", "trace")
self.default_filename = self.DEFAULT_FILENAME_BY_TASK.get(getattr(args, "task", ""), self.DEFAULT_FILENAME)
dataset_root = args.dataset_path
if not dataset_root:
dataset_root = self._download_default_trace()
self.items = self._load_items(dataset_root)
@staticmethod
def _coerce_int(x: Any) -> int | None:
if x is None:
return None
if isinstance(x, bool):
return None
if isinstance(x, int):
return x
try:
s = str(x).strip()
if not s:
return None
return int(float(s))
except Exception:
return None
@staticmethod
def _coerce_float(x: Any) -> float | None:
if x is None:
return None
if isinstance(x, float):
return x
if isinstance(x, int):
return float(x)
try:
s = str(x).strip()
if not s:
return None
return float(s)
except Exception:
return None
def _download_default_trace(self) -> str:
"""Download default trace file from HuggingFace Hub if not provided."""
try:
from huggingface_hub import hf_hub_download
except ImportError as exc:
raise ImportError(
"huggingface_hub is required to download the default trace dataset. "
"Install via `pip install huggingface_hub`."
) from exc
os.makedirs(self.cache_dir, exist_ok=True)
return hf_hub_download(
repo_id=self.DEFAULT_REPO_ID,
filename=self.default_filename,
repo_type="dataset",
local_dir=self.cache_dir,
local_dir_use_symlinks=False,
)
def _expand_paths(self, dataset_path: str | None) -> list[str]:
if not dataset_path:
return []
parts = [p.strip() for p in str(dataset_path).split(",") if p.strip()]
paths: list[str] = []
for p in parts:
if any(ch in p for ch in ["*", "?", "["]):
paths.extend(sorted(glob.glob(p)))
elif os.path.isdir(p):
paths.extend(sorted(glob.glob(os.path.join(p, "**", "*.txt"), recursive=True)))
else:
paths.append(p)
seen = set()
unique_paths = []
for p in paths:
if p not in seen:
seen.add(p)
unique_paths.append(p)
return unique_paths
def _parse_trace_file(self, path: str) -> list[dict[str, Any]]:
rows: list[dict[str, Any]] = []
def parse_request_repr_line(line: str) -> dict[str, Any] | None:
text = line.strip()
if not text:
return None
if not (text.startswith("Request(") and text.endswith(")")):
return None
inner = text[len("Request(") : -1]
try:
expr = ast.parse(f"f({inner})", mode="eval")
if not isinstance(expr.body, ast.Call):
return None
call = expr.body
out: dict[str, Any] = {}
for kw in call.keywords:
if kw.arg is None:
continue
out[kw.arg] = ast.literal_eval(kw.value)
return out
except Exception:
return None
# detect first non-empty line to pick parser
first_non_empty = None
with open(path, encoding="utf-8") as f:
for _ in range(50):
pos = f.tell()
line = f.readline()
if not line:
break
if line.strip():
first_non_empty = line.strip()
f.seek(pos)
break
if first_non_empty is None:
return rows
if first_non_empty.startswith("Request("):
with open(path, encoding="utf-8") as f:
for line in f:
parsed = parse_request_repr_line(line)
if isinstance(parsed, dict):
rows.append(parsed)
return rows
# txt fallback: parse Request(...) lines only
with open(path, encoding="utf-8") as f:
for line in f:
parsed = parse_request_repr_line(line)
if isinstance(parsed, dict):
rows.append(parsed)
return rows
def _load_items(self, dataset_root: str) -> list[dict[str, Any]]:
paths = self._expand_paths(dataset_root)
if not paths:
raise ValueError("No trace files found. Provide --dataset-path or rely on default HuggingFace download.")
items: list[dict[str, Any]] = []
for p in paths:
if not os.path.exists(p):
continue
for row in self._parse_trace_file(p):
if isinstance(row, dict):
row = dict(row)
row.setdefault("_source", p)
items.append(row)
if not items:
raise ValueError("Trace dataset is empty after parsing provided paths.")
if self.args.num_prompts is not None:
items = items[: self.args.num_prompts]
return items
def __len__(self) -> int:
return len(self.items)
def __getitem__(self, idx: int) -> RequestFuncInput:
row = self.items[idx]
prompt = row.get("prompt") or row.get("text") or ""
row_height = self._coerce_int(row.get("height"))
row_width = self._coerce_int(row.get("width"))
num_frames = self._coerce_int(row.get("num_frames"))
num_steps = self._coerce_int(row.get("num_inference_steps"))
seed = self._coerce_int(row.get("seed"))
fps = self._coerce_int(row.get("fps"))
timestamp = self._coerce_float(row.get("timestamp"))
slo_ms = self._coerce_float(row.get("slo_ms"))
image_paths = row.get("image_paths")
override_w = self.args.width
override_h = self.args.height
if override_w is not None or override_h is not None:
width = override_w
height = override_h
else:
width = row_width
height = row_height
return RequestFuncInput(
prompt=str(prompt),
api_url=self.api_url,
model=self.model,
width=width,
height=height,
num_frames=num_frames if num_frames is not None else self.args.num_frames,
num_inference_steps=num_steps if num_steps is not None else self.args.num_inference_steps,
seed=seed if seed is not None else self.args.seed,
fps=fps if fps is not None else self.args.fps,
timestamp=timestamp,
slo_ms=slo_ms,
image_paths=image_paths,
request_id=str(row.get("request_id")) if row.get("request_id") is not None else str(uuid.uuid4()),
)
def get_requests(self) -> list[RequestFuncInput]:
return [self[i] for i in range(len(self))]
class RandomDataset(BaseDataset):
def __init__(self, args, api_url: str, model: str):
self.args = args
self.api_url = api_url
self.model = model
self.num_prompts = args.num_prompts
def __len__(self) -> int:
return self.num_prompts
def __getitem__(self, idx: int) -> RequestFuncInput:
return RequestFuncInput(
prompt=f"Random prompt {idx} for benchmarking diffusion models",
api_url=self.api_url,
model=self.model,
width=self.args.width,
height=self.args.height,
num_frames=self.args.num_frames,
num_inference_steps=self.args.num_inference_steps,
seed=self.args.seed,
fps=self.args.fps,
)
def get_requests(self) -> list[RequestFuncInput]:
return [self[i] for i in range(len(self))]
def _compute_expected_latency_ms_from_base(req: RequestFuncInput, args, base_time_ms: float | None) -> float | None:
"""Compute expected execution time (ms) based on a base per-step-per-frame unit time.
Assumes linear scaling with pixel area, frame count, and num_inference_steps.
The base unit represents latency for a 16x16 resolution, single frame, single step.
"""
if base_time_ms is None:
return None
width = req.width if req.width is not None else args.width
height = req.height if req.height is not None else args.height
if width is None or height is None:
return None
frames = req.num_frames if req.num_frames is not None else args.num_frames
steps = req.num_inference_steps if req.num_inference_steps is not None else args.num_inference_steps
frame_scale = frames if isinstance(frames, int) and frames > 0 else 1
step_scale = steps if isinstance(steps, int) and steps > 0 else 1
area_units = max((float(width) * float(height)) / float(16 * 16), 1.0)
return float(base_time_ms) * area_units * frame_scale * step_scale
def _infer_slo_base_time_ms_from_warmups(
warmup_pairs: list[tuple[RequestFuncInput, RequestFuncOutput]],
args,
) -> float | None:
"""Infer base SLO unit time from warmup requests.
Returns the median base latency (ms) for a 16x16 resolution, single-frame,
single-step request. Only uses warmups that succeeded and have resolvable
width/height.
"""
candidates_ms: list[float] = []
for req, out in warmup_pairs:
if not out.success or out.latency <= 0:
continue
width = req.width if req.width is not None else args.width
height = req.height if req.height is not None else args.height
if width is None or height is None:
continue
frames = req.num_frames if req.num_frames is not None else args.num_frames
steps = req.num_inference_steps if req.num_inference_steps is not None else args.num_inference_steps
frame_scale = int(frames) if isinstance(frames, int) and frames > 0 else 1
step_scale = int(steps) if isinstance(steps, int) and steps > 0 else 1
area_units = max((float(width) * float(height)) / float(16 * 16), 1.0)
denom = area_units * float(frame_scale) * float(step_scale)
if denom <= 0:
continue
candidates_ms.append((out.latency * 1000.0) / denom)
if not candidates_ms:
return None
return float(np.median(candidates_ms))
def _populate_slo_ms_from_warmups(
requests_list: list[RequestFuncInput],
warmup_pairs: list[tuple[RequestFuncInput, RequestFuncOutput]],
args,
) -> list[RequestFuncInput]:
"""Populate missing RequestFuncInput.slo_ms using warmup outputs.
- If a request already has slo_ms (e.g., trace-provided), it is kept as-is.
- If any request has slo_ms is None and we can infer base time from warmups,
we estimate each missing request's expected execution time and set:
req.slo_ms = expected_latency_ms * args.slo_scale
Returns updated requests_list.
"""
if not any(req.slo_ms is None for req in requests_list):
return requests_list
base_time_ms = _infer_slo_base_time_ms_from_warmups(warmup_pairs, args)
if base_time_ms is None:
return requests_list
slo_scale = float(getattr(args, "slo_scale", 3.0))
if slo_scale <= 0:
raise ValueError(f"slo_scale must be positive, got {slo_scale}.")
updated: list[RequestFuncInput] = []
for req in requests_list:
if req.slo_ms is not None:
updated.append(req)
continue
expected_ms = _compute_expected_latency_ms_from_base(req, args, base_time_ms)
updated.append(replace(req, slo_ms=(expected_ms * slo_scale) if expected_ms is not None else None))
return updated
async def iter_requests(
requests_list: list[RequestFuncInput],
request_rate: float,
) -> AsyncGenerator[RequestFuncInput, None]:
"""Yield requests using a fixed interval if request_rate is set.
- If request_rate is inf, all requests are yielded immediately (no sleep).
- Otherwise, requests are emitted at a fixed cadence of 1 / request_rate seconds.
"""
if request_rate != float("inf"):
if request_rate <= 0:
raise ValueError(f"request_rate must be positive or inf, got {request_rate}.")
interval_s = 1.0 / float(request_rate)
for i, req in enumerate(requests_list):
if request_rate != float("inf") and i > 0:
await asyncio.sleep(interval_s)
yield req
def _guess_mime_type(path: str) -> str:
mime, _ = mimetypes.guess_type(path)
return mime or "application/octet-stream"
def _encode_image_as_data_url(path: str) -> str:
with open(path, "rb") as f:
encoded = base64.b64encode(f.read()).decode("utf-8")
mime = _guess_mime_type(path)
return f"data:{mime};base64,{encoded}"
async def async_request_chat_completions(
input: RequestFuncInput,
session: aiohttp.ClientSession,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
output = RequestFuncOutput()
output.start_time = time.perf_counter()
extra_body = dict(input.extra_body)
if input.width and input.height:
extra_body.setdefault("height", input.height)
extra_body.setdefault("width", input.width)
if input.num_frames:
extra_body.setdefault("num_frames", input.num_frames)
if input.num_inference_steps:
extra_body.setdefault("num_inference_steps", input.num_inference_steps)
if input.seed is not None:
extra_body.setdefault("seed", input.seed)
if input.fps:
extra_body.setdefault("fps", input.fps)
if input.image_paths and len(input.image_paths) > 0:
content = []
if input.prompt:
content.append({"type": "text", "text": input.prompt})
for img_path in input.image_paths:
if not os.path.exists(img_path):
output.error = f"Image file not found: {img_path}"
output.success = False
if pbar:
pbar.update(1)
return output
content.append(
{
"type": "image_url",
"image_url": {"url": _encode_image_as_data_url(img_path)},
}
)
messages = [{"role": "user", "content": content}]
else:
messages = [{"role": "user", "content": input.prompt}]
payload = {
"model": input.model,
"messages": messages,
}
if extra_body:
payload["extra_body"] = extra_body
try:
async with session.post(input.api_url, json=payload) as response:
if response.status == 200:
resp_json = await response.json()
output.response_body = resp_json
output.success = True
if "peak_memory_mb" in resp_json:
output.peak_memory_mb = resp_json["peak_memory_mb"]
else:
output.error = f"HTTP {response.status}: {await response.text()}"
output.success = False
except Exception as e:
output.error = str(e)
output.success = False
output.latency = time.perf_counter() - output.start_time
if output.success and input.slo_ms is not None:
output.slo_achieved = (output.latency * 1000.0) <= float(input.slo_ms)
if pbar:
pbar.update(1)
return output
def calculate_metrics(
outputs: list[RequestFuncOutput],
total_duration: float,
requests_list: list[RequestFuncInput],
args,
slo_enabled: bool,
):
success_outputs = [o for o in outputs if o.success]
error_outputs = [o for o in outputs if not o.success]
num_success = len(success_outputs)
latencies = [o.latency for o in success_outputs]
peak_memories = [o.peak_memory_mb for o in success_outputs if o.peak_memory_mb > 0]
metrics = {
"duration": total_duration,
"completed_requests": num_success,
"failed_requests": len(error_outputs),
"throughput_qps": num_success / total_duration if total_duration > 0 else 0,
"latency_mean": np.mean(latencies) if latencies else 0,
"latency_median": np.median(latencies) if latencies else 0,
"latency_p99": np.percentile(latencies, 99) if latencies else 0,
"latency_p50": np.percentile(latencies, 50) if latencies else 0,
"peak_memory_mb_max": max(peak_memories) if peak_memories else 0,
"peak_memory_mb_mean": np.mean(peak_memories) if peak_memories else 0,
"peak_memory_mb_median": np.median(peak_memories) if peak_memories else 0,
}
if slo_enabled:
slo_defined_total = 0
slo_met_success = 0
for req, out in zip(requests_list, outputs):
if req.slo_ms is None:
continue
slo_defined_total += 1
if out.slo_achieved is None:
continue
if out.slo_achieved:
slo_met_success += 1
slo_attain_all = (slo_met_success / slo_defined_total) if slo_defined_total > 0 else 0.0
metrics.update(
{
"slo_attainment_rate": slo_attain_all,
"slo_met_success": slo_met_success,
"slo_scale": getattr(args, "slo_scale", 3.0),
}
)
return metrics
def wait_for_service(base_url: str, timeout: int = 120) -> None:
print(f"Waiting for service at {base_url}...")
start_time = time.time()
while True:
try:
# Try /health endpoint first
resp = requests.get(f"{base_url}/health", timeout=1)
if resp.status_code == 200:
print("Service is ready.")
break
except requests.exceptions.RequestException:
pass
if time.time() - start_time > timeout:
raise TimeoutError(f"Service at {base_url} did not start within {timeout} seconds.")
time.sleep(1)
async def benchmark(args):
# Construct base_url if not provided
if args.base_url is None:
args.base_url = f"http://{args.host}:{args.port}"
# Setup dataset (vLLM-Omni supports diffusion via /v1/chat/completions)
api_url = f"{args.base_url}/v1/chat/completions"
request_func = async_request_chat_completions
if args.dataset == "vbench":
dataset = VBenchDataset(args, api_url, args.model)
elif args.dataset == "trace":
dataset = TraceDataset(args, api_url, args.model)
elif args.dataset == "random":
dataset = RandomDataset(args, api_url, args.model)
else:
raise ValueError(f"Unknown dataset: {args.dataset}")
print("Loading requests...")
requests_list = dataset.get_requests()
print(f"Prepared {len(requests_list)} requests from {args.dataset} dataset.")
# Limit concurrency
if args.max_concurrency is not None:
semaphore = asyncio.Semaphore(args.max_concurrency)
else:
semaphore = None
async def limited_request_func(req, session, pbar):
if semaphore:
async with semaphore:
return await request_func(req, session, pbar)
else:
return await request_func(req, session, pbar)
# Run benchmark
pbar = tqdm(total=len(requests_list), disable=args.disable_tqdm)
async with aiohttp.ClientSession() as session:
warmup_pairs: list[tuple[RequestFuncInput, RequestFuncOutput]] = []
if args.warmup_requests and requests_list:
print(
f"Running {args.warmup_requests} warmup request(s) \
with num_inference_steps={args.warmup_num_inference_steps}..."
)
for i in range(args.warmup_requests):
warm_req = requests_list[i % len(requests_list)]
if args.warmup_num_inference_steps is not None:
warm_req = replace(
warm_req,
num_inference_steps=args.warmup_num_inference_steps,
)
warm_out = await limited_request_func(warm_req, session, None)
warmup_pairs.append((warm_req, warm_out))
if args.slo:
# Prefer trace-provided per-request slo_ms. Only populate when missing.
requests_list = _populate_slo_ms_from_warmups(
requests_list=requests_list,
warmup_pairs=warmup_pairs,
args=args,
)
start_time = time.perf_counter()
tasks = []
async for req in iter_requests(requests_list=requests_list, request_rate=args.request_rate):
task = asyncio.create_task(limited_request_func(req, session, pbar))
tasks.append(task)
outputs = await asyncio.gather(*tasks)
total_duration = time.perf_counter() - start_time
pbar.close()
# Calculate metrics
metrics = calculate_metrics(outputs, total_duration, requests_list, args, args.slo)
print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=60, c="="))
# Section 1: Configuration
print("{:<40} {:<15}".format("Model:", args.model))
print("{:<40} {:<15}".format("Dataset:", args.dataset))
print("{:<40} {:<15}".format("Task:", args.task))
# Section 2: Execution & Traffic
print(f"{'-' * 50}")
print("{:<40} {:<15.2f}".format("Benchmark duration (s):", metrics["duration"]))
print("{:<40} {:<15}".format("Request rate:", str(args.request_rate)))
print(
"{:<40} {:<15}".format(
"Max request concurrency:",
str(args.max_concurrency) if args.max_concurrency else "not set",
)
)
print("{:<40} {}/{:<15}".format("Successful requests:", metrics["completed_requests"], len(requests_list)))
# Section 3: Performance Metrics
print(f"{'-' * 50}")
print("{:<40} {:<15.2f}".format("Request throughput (req/s):", metrics["throughput_qps"]))
print("{:<40} {:<15.4f}".format("Latency Mean (s):", metrics["latency_mean"]))
print("{:<40} {:<15.4f}".format("Latency Median (s):", metrics["latency_median"]))
print("{:<40} {:<15.4f}".format("Latency P99 (s):", metrics["latency_p99"]))
if args.slo:
print(f"{'-' * 50}")
print("{:<40} {:<15.2%}".format("SLO Attainment Rate (all):", metrics.get("slo_attainment_rate", 0.0)))
print("{:<40} {:<15}".format("SLO Met (success count):", str(metrics.get("slo_met_success", 0))))
print("{:<40} {:<15}".format("SLO Scale:", str(metrics.get("slo_scale", 3.0))))
if metrics["peak_memory_mb_max"] > 0:
print(f"{'-' * 50}")
print("{:<40} {:<15.2f}".format("Peak Memory Max (MB):", metrics["peak_memory_mb_max"]))
print("{:<40} {:<15.2f}".format("Peak Memory Mean (MB):", metrics["peak_memory_mb_mean"]))
print("{:<40} {:<15.2f}".format("Peak Memory Median (MB):", metrics["peak_memory_mb_median"]))
print("\n" + "=" * 60)
if args.output_file:
with open(args.output_file, "w") as f:
json.dump(metrics, f, indent=2)
print(f"Metrics saved to {args.output_file}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark serving for diffusion models.")
parser.add_argument(
"--base-url",
type=str,
default=None,
help="Base URL of the server (e.g., http://localhost:8091). Overrides host/port.",
)
parser.add_argument("--host", type=str, default="localhost", help="Server host.")
parser.add_argument("--port", type=int, default=8091, help="Server port.")
parser.add_argument("--model", type=str, default="default", help="Model name.")
parser.add_argument(
"--dataset",
type=str,
default="vbench",
choices=["vbench", "trace", "random"],
help="Dataset to use.",
)
parser.add_argument(
"--task",
type=str,
default="t2v",
choices=["t2v", "i2v", "ti2v", "ti2i", "i2i", "t2i"],
help="Task type.",
)
parser.add_argument(
"--dataset-path",
type=str,
default=None,
help="Path to local dataset file (optional).",
)
parser.add_argument("--num-prompts", type=int, default=10, help="Number of prompts to benchmark.")
parser.add_argument(
"--max-concurrency",
type=int,
default=1,
help="Maximum number of concurrent requests, default to `1`. This can be used "
"to help simulate an environment where a higher level component "
"is enforcing a maximum number of concurrent requests. While the "
"--request-rate argument controls the rate at which requests are "
"initiated, this argument will control how many are actually allowed "
"to execute at a time. This means that when used in combination, the "
"actual request rate may be lower than specified with --request-rate, "
"if the server is not processing requests fast enough to keep up.",
)
parser.add_argument(
"--request-rate",
type=float,
default=float("inf"),
help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
"Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
)
parser.add_argument(
"--warmup-requests",
type=int,
default=1,
help="Number of warmup requests to run before measurement.",
)
parser.add_argument(
"--warmup-num-inference-steps",
type=int,
default=1,
help="num_inference_steps used for warmup requests.",
)
parser.add_argument("--width", type=int, default=None, help="Image/Video width.")
parser.add_argument("--height", type=int, default=None, help="Image/Video height.")
parser.add_argument("--num-frames", type=int, default=None, help="Number of frames (for video).")
parser.add_argument(
"--num-inference-steps",
type=int,
default=50,
help="Number of inference steps (for diffusion models).",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Random seed (for diffusion models).",
)
parser.add_argument("--fps", type=int, default=None, help="FPS (for video).")
parser.add_argument("--output-file", type=str, default=None, help="Output JSON file for metrics.")
parser.add_argument(
"--slo",
action="store_true",
help=(
"Enable SLO calculation and reporting. If trace provides per-request slo_ms, it is used. "
"Otherwise, warmup request(s) are used to infer expected execution time assuming linear "
"scaling by resolution, frames, and steps, then slo_ms = expected_time * --slo-scale."
),
)
parser.add_argument(
"--slo-scale",
type=float,
default=3.0,
help="SLO target multiplier: slo_ms = estimated_exec_time_ms * slo_scale (default: 3).",
)
parser.add_argument("--disable-tqdm", action="store_true", help="Disable progress bar.")
args = parser.parse_args()
asyncio.run(benchmark(args))
# Benchmarks Guide
This README explains how to (1) prepare benchmark datasets and (2) run the provided Qwen3-Omni benchmarks.
## 1) Prepare the dataset (SeedTTS top100)
```bash
cd benchmarks/build_dataset
pip install gdown
# Download SeedTTS test set from Google Drive
gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP
# Extract
tar -xf seedtts_testset.tar
# Copy metadata and extract top-100 prompts
cp seedtts_testset/en/meta.lst meta.lst
python extract_tts_prompts.py -i meta.lst -o top100.txt -n 100
# (Optional) clean up to save space
rm -rf seedtts_testset seedtts_testset.tar meta.lst
```
Artifacts:
- `benchmarks/build_dataset/top100.txt` — 100 text prompts (one per line).
## 2) Run benchmarks
All commands assume repo root (`vllm-omni`).
### A. Transformers benchmark (offline, HF Transformers)
```bash
bash benchmarks/qwen3-omni/transformers/eval_qwen3_moe_omni_transformers.sh
```
What it does:
- Runs `qwen3_omni_moe_transformers.py` over `top100.txt` with `--num_prompts 100`.
- Outputs to `benchmarks/qwen3-omni/transformers/benchmark_results/`:
- `perf_stats.json` — aggregated & per-prompt TPS/latency (thinker/talker/code2wav/overall).
- `results.json` — per-prompt outputs and audio paths.
- `audio/` — ~100 generated `.wav` files.
Key checks:
- `overall_tps` and `*_tps_avg` should be non-zero and reasonably stable.
- Investigate any 0/NaN or unusually low TPS / long-tail latency.
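A minimal sketch for eyeballing the aggregated numbers (the path is as described above; the exact key layout depends on the benchmark's `perf_stats.json`):
```python
import json

# Minimal sketch: load the aggregated stats written by the transformers benchmark
# and print them for a quick sanity check (see the key checks above).
with open("benchmarks/qwen3-omni/transformers/benchmark_results/perf_stats.json") as f:
    print(json.dumps(json.load(f), indent=2))
```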
### B. vLLM Omni end-to-end benchmark (pipeline)
```bash
bash benchmarks/qwen3-omni/vllm_omni/eval_qwen3_moe_omni.sh
```
What it does:
- Runs `examples/offline_inference/qwen3_omni/end2end.py` with `--enable-stats`.
- Uses `benchmarks/build_dataset/top100.txt` and writes to:
- Logs: `benchmarks/qwen3-omni/vllm_omni/logs/`
- `omni_llm_pipeline_text.orchestrator.stats.jsonl` — per-stage latency stats.
- `omni_llm_pipeline_text.overall.stats.jsonl` — end-to-end latency/TPS.
- `omni_llm_pipeline_text.stage{0,1,2}.log` — per-stage detailed logs/errors.
- Outputs: `benchmarks/qwen3-omni/vllm_omni/outputs/` — ~100 text and `.wav` files.
Key checks:
- Overall stats: end-to-end latency/TPS should be reasonable.
- Orchestrator stats: per-stage latency should be stable; investigate long tails.
- Stage logs: ensure no errors and no unusually slow stages.
## Performance snapshot
The chart below summarizes our measured Qwen3-Omni MoE end-to-end benchmark, comparing vLLM-Omni against HF Transformers, and shows the overall throughput advantage of vLLM-Omni. These are measured results; use them as the reference point when evaluating or reproducing the benchmark.
![vLLM-Omni vs HF](./vllm-omni-vs-hf.png)
## Directory layout
- `benchmarks/build_dataset/` — dataset prep utilities (e.g., SeedTTS top100).
- `benchmarks/<model>/vllm_omni/` — vLLM-Omni pipeline benchmarks, logs, outputs.
- `benchmarks/<model>/transformers/` — HF Transformers benchmarks (offline reference).
- `benchmarks/<model>/vllm-omni-vs-hf.png` — current performance snapshot (overall throughput comparison).
- Add new tasks under `benchmarks/<model>/...` with the same pattern: `transformers/`, `vllm_omni/`, a task-specific README, and (optionally) dataset prep notes.
## Troubleshooting
- Make sure GPU/driver/FlashAttention2 requirements are met for the chosen model.
- If downloads fail, confirm network access to Google Drive (`gdown`) and Hugging Face.
- If audio files are missing, check for errors in stage logs or model generation.
#!/bin/bash
# Qwen3-Omni Transformers Benchmark Evaluation Script
# This script must be run from the vllm-omni root directory
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Navigate to vllm-omni root directory (4 levels up from script location)
VLLM_OMNI_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
cd "$VLLM_OMNI_ROOT" || { echo "Error: Failed to navigate to vllm-omni directory"; exit 1; }
echo "Working directory: $(pwd)"
# Verify we're in the correct directory and run benchmark
if [[ ! -f "benchmarks/qwen3-omni/transformers/qwen3_omni_moe_transformers.py" ]]; then
echo "Error: Not in vllm-omni root directory. Please run from vllm-omni folder."
else
cd benchmarks/qwen3-omni/transformers
python qwen3_omni_moe_transformers.py --prompts_file ../../build_dataset/top100.txt --num_prompts 100
echo "Logs and outputs are saved to $(pwd)/benchmark_results:"
echo " - perf_stats.json Aggregated/per-prompt TPS and latency (thinker/talker/code2wav/overall)"
echo " - results.json Per-prompt outputs and audio paths"
echo " - audio/ Generated wav files, there should be 100 wav file generated"
echo "Key checks: overall_tps and *_tps_avg should be non-zero and stable; investigate 0/NaN or unusually low TPS/long-tail latency."
fi
import time
import torch
from transformers import Qwen3OmniMoeForConditionalGeneration
class Qwen3OmniMoeForConditionalGenerationWithLogging(Qwen3OmniMoeForConditionalGeneration):
@torch.no_grad()
def generate(
self,
input_ids: torch.Tensor | None = None,
speaker: str = "Ethan",
use_audio_in_video: bool = False,
return_audio: bool | None = None,
thinker_max_new_tokens: int = 1024,
thinker_eos_token_id: int = 151645,
talker_max_new_tokens: int = 4096,
talker_do_sample: bool = True,
talker_top_k: int = 50,
talker_top_p: float = 1.0,
talker_temperature: float = 0.9,
talker_repetition_penalty: float = 1.05,
**kwargs,
):
total_t0 = time.time()
perf_stats = {
"thinker_tokens": 0,
"thinker_time_s": 0.0,
"thinker_tps": 0.0,
"talker_tokens": 0,
"talker_time_s": 0.0,
"talker_tps": 0.0,
"code2wav_tokens": 0,
"code2wav_time_s": 0.0,
"code2wav_tps": 0.0,
"total_tokens": 0,
"total_time_s": 0.0,
"total_tps": 0.0,
}
if return_audio and not self.has_talker:
raise ValueError(
"Cannot use talker when talker module not initialized. "
"Use `enable_talker` method or set enable_talker in config "
"to enable talker."
)
if return_audio is None:
return_audio = self.has_talker
shared_kwargs = {"use_audio_in_video": use_audio_in_video}
thinker_kwargs = {
"max_new_tokens": thinker_max_new_tokens,
"eos_token_id": thinker_eos_token_id,
}
talker_kwargs = {}
token2wav_kwargs = {}
if return_audio:
speaker_id = self.config.talker_config.speaker_id.get(speaker.lower())
if speaker_id is None:
raise NotImplementedError(f"Speaker {speaker} not implemented")
if input_ids.shape[0] != 1:
raise NotImplementedError("Qwen3-Omni currently does not support batched inference with audio output")
talker_suppressed_tokens = [
i
for i in range(
self.config.talker_config.text_config.vocab_size - 1024,
self.config.talker_config.text_config.vocab_size,
)
if i != self.config.talker_config.codec_eos_token_id
] # Suppress additional special tokens, should not be predicted
talker_kwargs = {
"max_new_tokens": talker_max_new_tokens,
"do_sample": talker_do_sample,
"top_k": talker_top_k,
"top_p": talker_top_p,
"temperature": talker_temperature,
"eos_token_id": self.config.talker_config.codec_eos_token_id,
"repetition_penalty": talker_repetition_penalty,
"suppress_tokens": talker_suppressed_tokens,
"output_hidden_states": True,
"return_dict_in_generate": True,
}
token2wav_kwargs = {}
for key, value in kwargs.items():
if key.startswith("thinker_"):
thinker_kwargs[key[len("thinker_") :]] = value
elif key.startswith("talker_"):
talker_kwargs[key[len("talker_") :]] = value
elif key.startswith("token2wav_"):
token2wav_kwargs[key[len("token2wav_") :]] = value
# Process special input values
elif key == "feature_attention_mask":
thinker_kwargs[key] = value
talker_kwargs["audio_feature_lengths"] = torch.sum(value, dim=1)
elif key in ("input_features", "attention_mask"):
thinker_kwargs[key] = value
# Put other key to shared kwargs
else:
shared_kwargs[key] = value
# Merge kwargs
for key, value in shared_kwargs.items():
if key not in thinker_kwargs:
thinker_kwargs[key] = value
if key not in talker_kwargs and key in ["image_grid_thw", "video_grid_thw", "video_second_per_grid"]:
talker_kwargs[key] = value
if key not in token2wav_kwargs:
token2wav_kwargs[key] = value
# 1. Generate from thinker module
generate_audio = return_audio and self.has_talker
if generate_audio:
thinker_kwargs["output_hidden_states"] = True
thinker_kwargs["return_dict_in_generate"] = True
t0 = time.time()
thinker_result = self.thinker.generate(input_ids=input_ids, **thinker_kwargs)
t1 = time.time()
perf_stats["thinker_time_s"] = max(0.0, t1 - t0)
try:
prompt_len = int(input_ids.shape[1]) if input_ids is not None else 0
total_len = int(thinker_result.sequences.shape[-1])
thinker_out_len = max(0, total_len - prompt_len)
except Exception:
thinker_out_len = 0
perf_stats["thinker_tokens"] = thinker_out_len
perf_stats["thinker_tps"] = (
(thinker_out_len / perf_stats["thinker_time_s"]) if perf_stats["thinker_time_s"] > 0 else 0.0
)
if not generate_audio:
perf_stats["total_tokens"] = perf_stats["thinker_tokens"]
perf_stats["total_time_s"] = time.time() - total_t0
perf_stats["total_tps"] = (
(perf_stats["total_tokens"] / perf_stats["total_time_s"]) if perf_stats["total_time_s"] > 0 else 0.0
)
# attach stats to self
setattr(self, "_perf_stats_last", perf_stats)
if not hasattr(self, "_perf_stats_history"):
setattr(self, "_perf_stats_history", [])
self._perf_stats_history.append(perf_stats)
return thinker_result, None
# 2. Prepare talker input
thinker_embed = torch.cat([hidden_states[0] for hidden_states in thinker_result.hidden_states], dim=1).to(
self.talker.device
) # [1 t d]
thinker_hidden = torch.cat(
[
hidden_states[self.config.talker_config.accept_hidden_layer]
for hidden_states in thinker_result.hidden_states
],
dim=1,
).to(self.talker.device) # [1 t d]
im_start_indexes = torch.cat(
(
torch.nonzero(input_ids[0] == self.config.im_start_token_id).squeeze(),
torch.tensor([thinker_result.sequences.shape[-1]], device=input_ids.device, dtype=input_ids.dtype),
),
dim=-1,
).to(self.talker.device) # Shape [n_starts + 1]; Take batch 0 since batched inference is not supported here.
multimodal_mask = (
(thinker_result.sequences == self.config.thinker_config.audio_token_id) |
(thinker_result.sequences == self.config.thinker_config.image_token_id) |
(thinker_result.sequences == self.config.thinker_config.video_token_id)
).to(self.talker.device) # [1 t] # fmt: skip
talker_special_tokens = torch.tensor(
[[self.config.tts_bos_token_id, self.config.tts_eos_token_id, self.config.tts_pad_token_id]],
device=self.thinker.device,
dtype=input_ids.dtype,
)
tts_bos_embed, tts_eos_embed, tts_pad_embed = (
self.talker.text_projection(self.thinker.get_input_embeddings()(talker_special_tokens))
.to(self.talker.device)
.chunk(3, dim=1)
) # 3 * [1 1 d]
talker_input_embeds = [] # [1 t d]
talker_input_ids = []
# For every chatml parts
for i in range(len(im_start_indexes) - 1):
im_start_index = im_start_indexes[i]
segment_end_index = im_start_indexes[i + 1]
role_token = input_ids[0][im_start_index + 1]
# Talker should ignore thinker system prompt
if role_token == self.config.system_token_id:
continue
# Talker takes word embeddings for tokens and hidden state from `accept_hidden_layer` for multimodal inputs
elif role_token == self.config.user_token_id:
talker_user_part = self._get_talker_user_parts(
im_start_index, segment_end_index, multimodal_mask, thinker_hidden, thinker_embed
)
talker_input_embeds.append(talker_user_part)
talker_input_ids.append(thinker_result.sequences[:, im_start_index:segment_end_index])
# Take assistant output (for now)
elif role_token == self.config.assistant_token_id and i == len(im_start_indexes) - 2:
talker_assistant_embeds, talker_assistant_ids, trailing_text_hidden = self._get_talker_assistant_parts(
im_start_index,
segment_end_index,
speaker_id,
thinker_embed,
tts_pad_embed,
tts_bos_embed,
tts_eos_embed,
)
talker_input_embeds.append(talker_assistant_embeds)
talker_input_ids.append(talker_assistant_ids)
# History assistant output (ignore for now)
elif role_token == self.config.assistant_token_id and i != len(im_start_indexes) - 2:
continue
else:
raise AssertionError("Expect role id after <|im_start|> (assistant, user, system)")
talker_input_embed = torch.cat([embed.to(self.talker.device) for embed in talker_input_embeds], dim=1)
talker_input_id = torch.cat([embed.to(self.talker.device) for embed in talker_input_ids], dim=1)
t2 = time.time()
talker_result = self.talker.generate(
inputs_embeds=talker_input_embed,
trailing_text_hidden=trailing_text_hidden,
tts_pad_embed=tts_pad_embed,
talker_input_ids=talker_input_id,  # Do not pass input_ids directly, to prevent repetition-penalty out-of-bound indexing
**talker_kwargs,
)
t3 = time.time()
perf_stats["talker_time_s"] = max(0.0, t3 - t2)
talker_codes = (
torch.stack([hid[-1] for hid in talker_result.hidden_states if hid[-1] is not None], dim=1)
.transpose(1, 2)
.to(self.code2wav.device)
)
try:
# codes shape: (B, num_quantizers, T). We log T as token length.
perf_stats["talker_tokens"] = int(talker_codes.shape[-1])
except Exception:
perf_stats["talker_tokens"] = 0
perf_stats["talker_tps"] = (
(perf_stats["talker_tokens"] / perf_stats["talker_time_s"]) if perf_stats["talker_time_s"] > 0 else 0.0
)
t4 = time.time()
talker_wavs = self.code2wav.chunked_decode(talker_codes, chunk_size=300, left_context_size=25).float()
t5 = time.time()
perf_stats["code2wav_time_s"] = max(0.0, t5 - t4)
perf_stats["code2wav_tokens"] = perf_stats["talker_tokens"] # same T, not times 16
perf_stats["code2wav_tps"] = (
(perf_stats["code2wav_tokens"] / perf_stats["code2wav_time_s"])
if perf_stats["code2wav_time_s"] > 0
else 0.0
)
perf_stats["total_tokens"] = perf_stats["thinker_tokens"] + perf_stats["talker_tokens"]
perf_stats["total_time_s"] = time.time() - total_t0
perf_stats["total_tps"] = (
(perf_stats["total_tokens"] / perf_stats["total_time_s"]) if perf_stats["total_time_s"] > 0 else 0.0
)
setattr(self, "_perf_stats_last", perf_stats)
if not hasattr(self, "_perf_stats_history"):
setattr(self, "_perf_stats_history", [])
self._perf_stats_history.append(perf_stats)
return thinker_result, talker_wavs.float()
__all__ = [
"Qwen3OmniMoeForConditionalGenerationWithLogging",
]