Commit c1cacde6 authored by weishb

vllm-omni_0.15.0.rc1+fix1 first commit

parent 35607782
#!/bin/bash
# vllm-omni customized version
# Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/bootstrap-amd.sh
# Last synced: 2025-12-15
# Modifications: Use local template file instead of downloading from ci-infra
set -euo pipefail
if [[ -z "${RUN_ALL:-}" ]]; then
RUN_ALL=0
fi
if [[ -z "${NIGHTLY:-}" ]]; then
NIGHTLY=0
fi
if [[ -z "${VLLM_CI_BRANCH:-}" ]]; then
VLLM_CI_BRANCH="main"
fi
if [[ -z "${AMD_MIRROR_HW:-}" ]]; then
AMD_MIRROR_HW="amdproduction"
fi
if [[ -z "${DOCS_ONLY_DISABLE:-}" ]]; then
DOCS_ONLY_DISABLE=0
fi
fail_fast() {
DISABLE_LABEL="ci-no-fail-fast"
# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
if [[ $PR_LABELS == *"$DISABLE_LABEL"* ]]; then
echo false
else
echo true
fi
else
echo false # not a PR or BUILDKITE_PULL_REQUEST not set
fi
}
check_run_all_label() {
RUN_ALL_LABEL="ready-run-all-tests"
# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
if [[ $PR_LABELS == *"$RUN_ALL_LABEL"* ]]; then
echo true
else
echo false
fi
else
echo false # not a PR or BUILDKITE_PULL_REQUEST not set
fi
}
if [[ -z "${COV_ENABLED:-}" ]]; then
COV_ENABLED=0
fi
upload_pipeline() {
echo "Uploading pipeline..."
ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI'
# Install minijinja
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh
source /var/lib/buildkite-agent/.cargo/env
if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then
AMD_MIRROR_HW="amdtentative"
fi
# Use local template file for vllm-omni
cp .buildkite/test-template-amd-omni.j2 .buildkite/test-template.j2
# (WIP) Use pipeline generator instead of jinja template
if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then
python -m pip install click pydantic
python .buildkite/pipeline_generator/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" --nightly="$NIGHTLY" --mirror_hw="$AMD_MIRROR_HW"
buildkite-agent pipeline upload .buildkite/pipeline.yaml
exit 0
fi
echo "List file diff: $LIST_FILE_DIFF"
echo "Run all: $RUN_ALL"
echo "Nightly: $NIGHTLY"
echo "AMD Mirror HW: $AMD_MIRROR_HW"
FAIL_FAST=$(fail_fast)
cd .buildkite
(
set -x
# Output pipeline.yaml with all blank lines removed
minijinja-cli test-template.j2 test-amd.yaml \
-D branch="$BUILDKITE_BRANCH" \
-D list_file_diff="$LIST_FILE_DIFF" \
-D run_all="$RUN_ALL" \
-D nightly="$NIGHTLY" \
-D mirror_hw="$AMD_MIRROR_HW" \
-D fail_fast="$FAIL_FAST" \
-D vllm_use_precompiled="$VLLM_USE_PRECOMPILED" \
-D vllm_merge_base_commit="$(git merge-base origin/main HEAD)" \
-D cov_enabled="$COV_ENABLED" \
-D vllm_ci_branch="$VLLM_CI_BRANCH" \
| sed '/^[[:space:]]*$/d' \
> pipeline.yaml
)
cat pipeline.yaml
buildkite-agent artifact upload pipeline.yaml
buildkite-agent pipeline upload pipeline.yaml
exit 0
}
get_diff() {
# Stage everything so new/untracked files show up in the diff
git add . > /dev/null
# echo flattens the newline-separated list into one space-separated line,
# which the callers below rely on when word-splitting.
echo $(git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD))
}
get_diff_main() {
git add . > /dev/null
echo $(git diff --name-only --diff-filter=ACMDR HEAD~1)
}
file_diff=$(get_diff)
if [[ $BUILDKITE_BRANCH == "main" ]]; then
file_diff=$(get_diff_main)
fi
# ----------------------------------------------------------------------
# Early exit start: skip pipeline if conditions are met
# ----------------------------------------------------------------------
# skip pipeline if all changed files are under docs/
if [[ "${DOCS_ONLY_DISABLE}" != "1" ]]; then
if [[ -n "${file_diff:-}" ]]; then
docs_only=1
# Robust iteration over newline-separated file_diff
while IFS= read -r f; do
[[ -z "$f" ]] && continue
# Policy: only skip if every changed path starts with docs/
if [[ "$f" != docs/* ]]; then
docs_only=0
break
fi
done < <(printf '%s\n' "$file_diff" | tr ' ' '\n' | tr -d '\r')
if [[ "$docs_only" -eq 1 ]]; then
buildkite-agent annotate ":memo: CI skipped — docs/** only changes detected
\`\`\`
${file_diff}
\`\`\`" --style "info" || true
echo "[docs-only] All changes are under docs/. Exiting before pipeline upload."
exit 0
fi
fi
fi
# ----------------------------------------------------------------------
# Early exit end
# ----------------------------------------------------------------------
patterns=(
"docker/Dockerfile"
"CMakeLists.txt"
"requirements/common.txt"
"requirements/cuda.txt"
"requirements/build.txt"
"requirements/test.txt"
"setup.py"
"csrc/"
"cmake/"
)
ignore_patterns=(
"docker/Dockerfile."
"csrc/cpu"
"csrc/rocm"
"cmake/hipify.py"
"cmake/cpu_extension.cmake"
)
for file in $file_diff; do
# First check if file matches any pattern
matches_pattern=0
for pattern in "${patterns[@]}"; do
if [[ $file == $pattern* ]] || [[ $file == $pattern ]]; then
matches_pattern=1
break
fi
done
# If file matches pattern, check it's not in ignore patterns
if [[ $matches_pattern -eq 1 ]]; then
matches_ignore=0
for ignore in "${ignore_patterns[@]}"; do
if [[ $file == $ignore* ]] || [[ $file == $ignore ]]; then
matches_ignore=1
break
fi
done
if [[ $matches_ignore -eq 0 ]]; then
RUN_ALL=1
echo "Found changes: $file. Run all tests"
break
fi
fi
done
# Check for ready-run-all-tests label
LABEL_RUN_ALL=$(check_run_all_label)
if [[ $LABEL_RUN_ALL == true ]]; then
RUN_ALL=1
NIGHTLY=1
echo "Found 'ready-run-all-tests' label. Running all tests including optional tests."
fi
# Decide whether to use precompiled wheels.
# Reuses the RUN_ALL decision derived from the patterns array above.
if [[ -n "${VLLM_USE_PRECOMPILED:-}" ]]; then
echo "VLLM_USE_PRECOMPILED is already set to: $VLLM_USE_PRECOMPILED"
elif [[ $RUN_ALL -eq 1 ]]; then
export VLLM_USE_PRECOMPILED=0
echo "Detected critical changes, building wheels from source"
else
export VLLM_USE_PRECOMPILED=1
echo "No critical changes, using precompiled wheels"
fi
LIST_FILE_DIFF=$(get_diff | tr ' ' '|')
if [[ $BUILDKITE_BRANCH == "main" ]]; then
LIST_FILE_DIFF=$(get_diff_main | tr ' ' '|')
fi
upload_pipeline
steps:
- label: ":docker: Build image"
key: image-build
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker build --file docker/Dockerfile.ci -t vllm-omni-ci ."
- "docker tag vllm-omni-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
agents:
queue: "cpu_queue_premerge"
# - label: "Test on NPU"
# depends_on: ~
# key: npu-test
# commands:
# - ".buildkite/scripts/hardware_ci/run_npu_test.sh"
# agents:
# queue: "ascend"
- label: "Simple Unit Test"
depends_on: image-build
commands:
- pytest -v -s tests/entrypoints/
- pytest -v -s tests/diffusion/cache/
- pytest -v -s tests/diffusion/lora/
- pytest -v -s tests/model_executor/models/qwen2_5_omni/test_audio_length.py
- pytest -v -s tests/worker/
- pytest -v -s tests/distributed/omni_connectors/test_kv_flow.py
agents:
queue: "gpu_1_queue"
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Model Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Images API LoRA E2E"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Model CPU offloading Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
- pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Audio Generation Model Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Cache Backend Test"
timeout_in_minutes: 15
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Sequence Parallelism Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
shm-size: "8gb"
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Tensor Parallelism Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
shm-size: "8gb"
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion GPU Worker Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/diffusion/test_diffusion_worker.py
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
shm-size: "8gb"
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Benchmark Test"
timeout_in_minutes: 15
depends_on: image-build
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/benchmarks/test_serve_cli.py
agents:
queue: "mithril-h100-pool"
plugins:
- kubernetes:
podSpec:
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
resources:
limits:
nvidia.com/gpu: 2
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: HF_HOME
value: /root/.cache/huggingface
nodeSelector:
node.kubernetes.io/instance-type: gpu-h100-sxm
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate
- label: "Omni Model Test"
timeout_in_minutes: 15
depends_on: image-build
commands:
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
# - label: "Omni Model Test with H100"
# timeout_in_minutes: 30
# depends_on: image-build
# commands:
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
# - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
# - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
# - pytest -s -v tests/e2e/online_serving/test_async_omni.py
# agents:
# queue: "mithril-h100-pool"
# plugins:
# - kubernetes:
# podSpec:
# containers:
# - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# resources:
# limits:
# nvidia.com/gpu: 2
# volumeMounts:
# - name: devshm
# mountPath: /dev/shm
# - name: hf-cache
# mountPath: /root/.cache/huggingface
# env:
# - name: HF_HOME
# value: /root/.cache/huggingface
# nodeSelector:
# node.kubernetes.io/instance-type: gpu-h100-sxm
# volumes:
# - name: devshm
# emptyDir:
# medium: Memory
# - name: hf-cache
# hostPath:
# path: /mnt/hf-cache
# type: DirectoryOrCreate
- label: "Diffusion Image Edit Test with H100 (1 GPU)"
timeout_in_minutes: 20
depends_on: image-build
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
agents:
queue: "mithril-h100-pool"
plugins:
- kubernetes:
podSpec:
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: HF_HOME
value: /root/.cache/huggingface
nodeSelector:
node.kubernetes.io/instance-type: gpu-h100-sxm
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate
# - label: "Bagel Text2Img Model Test with H100"
# timeout_in_minutes: 30
# depends_on: image-build
# commands:
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
# agents:
# queue: "mithril-h100-pool"
# plugins:
# - kubernetes:
# podSpec:
# containers:
# - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# resources:
# limits:
# nvidia.com/gpu: 1
# volumeMounts:
# - name: devshm
# mountPath: /dev/shm
# - name: hf-cache
# mountPath: /root/.cache/huggingface
# env:
# - name: HF_HOME
# value: /root/.cache/huggingface
# nodeSelector:
# node.kubernetes.io/instance-type: gpu-h100-sxm
# volumes:
# - name: devshm
# emptyDir:
# medium: Memory
# - name: hf-cache
# hostPath:
# path: /mnt/hf-cache
# type: DirectoryOrCreate
#!/bin/bash
# Helper function to safely login to ECR Public with per-job config isolation
# Uses DOCKER_CONFIG environment variable to prevent race conditions
#
# This script prevents the "device or resource busy" error by giving each
# Buildkite job its own isolated Docker config directory.
#
# Usage:
# source docker_login_ecr_public.sh && safe_docker_login_ecr_public
set -euo pipefail
# Configuration
ECR_REGISTRY="public.ecr.aws"
setup_isolated_docker_config() {
# Use BUILDKITE_JOB_ID for job-specific isolation
# Fallback to PID if running outside Buildkite
local job_id="${BUILDKITE_JOB_ID:-$$}"
# Set Docker config to job-specific directory
export DOCKER_CONFIG="/tmp/docker-config-${job_id}"
# Create directory if it doesn't exist
mkdir -p "$DOCKER_CONFIG"
echo "[docker-config] Using isolated Docker config: $DOCKER_CONFIG"
}
check_docker_auth() {
# Check if already authenticated to the given registry
# Returns 0 if authenticated, 1 if not
local registry="$1"
# Check if credentials exist in the isolated config
if [[ -f "$DOCKER_CONFIG/config.json" ]]; then
# Check if registry is present in config
if grep -q "$registry" "$DOCKER_CONFIG/config.json" 2>/dev/null; then
return 0
fi
fi
return 1
}
safe_docker_login_ecr_public() {
# Setup isolated config first
setup_isolated_docker_config
local registry="$ECR_REGISTRY"
# Check if already authenticated (within this job)
if check_docker_auth "$registry"; then
echo "[docker-login] Already authenticated to $registry in this job"
return 0
fi
# Perform login to isolated config directory
echo "[docker-login] Logging in to $ECR_REGISTRY (isolated config)..."
if aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$ECR_REGISTRY"; then
echo "[docker-login] Login successful (config: $DOCKER_CONFIG)"
return 0
else
local exit_code=$?
echo "[docker-login] ERROR: Login failed with exit code $exit_code" >&2
return $exit_code
fi
}
# Execute if run as script (not sourced)
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
safe_docker_login_ecr_public
fi
#!/bin/bash
# vllm-omni customized version
# Based on: vllm/.buildkite/scripts/hardware_ci/run-amd-test.sh
# Last synced: 2025-12-15
# Modifications: docker image name for vllm-omni
# This script runs tests inside the corresponding ROCm docker container.
set -o pipefail
# Export Python path
export PYTHONPATH=".."
# Wait until the GPUs report a clean state
echo "--- Confirming Clean Initial State"
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- ROCm info"
rocminfo
# cleanup older docker images
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}
# Call the cleanup docker function
cleanup_docker
echo "--- Resetting GPUs"
echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container"
image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-rocm-omni"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Install AWS CLI to authenticate to ECR Public Gallery to get higher rate limit for pulling images
sudo apt-get update && sudo apt-get install -y awscli
# Use safe docker login helper to prevent race conditions
source "$(dirname "${BASH_SOURCE[0]}")/../docker_login_ecr_public.sh"
safe_docker_login_ecr_public
# Pull the container from ECR Public Gallery
docker pull "${image_name}"
remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT
echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
commands=$@
echo "Commands:$commands"
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."
# Test that we're launching on the machine that has
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1
fi
# If the command contains a shard flag, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e MIOPEN_DEBUG_CONV_DIRECT=0 \
-e MIOPEN_DEBUG_CONV_GEMM=0 \
-e VLLM_ROCM_USE_AITER=1 \
-e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}_${GPU}" \
"${image_name}" \
/bin/bash -c "${commands_gpu}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
# Wait for all processes to finish and collect exit codes
for pid in "${PIDS[@]}"; do
wait "${pid}"
STATUS+=($?)
done
for st in "${STATUS[@]}"; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
exit "${st}"
fi
done
else
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e MIOPEN_DEBUG_CONV_DIRECT=0 \
-e MIOPEN_DEBUG_CONV_GEMM=0 \
-e VLLM_ROCM_USE_AITER=1 \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
fi
#!/bin/bash
# This script builds the Ascend NPU docker image and runs offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex
# Base ubuntu image with basic ascend development libraries and python installed
VLLM_OMNI_REPO="https://github.com/vllm-project/vllm-omni.git"
BASE_IMAGE_NAME="quay.nju.edu.cn/ascend/vllm-ascend:v0.11.0rc2"
image_name="npu/vllm-omni-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
# image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir}
# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host pypi-cache:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--build-arg BUILDKITE_PULL_REQUEST="${BUILDKITE_PULL_REQUEST}" \
--build-arg BUILDKITE_PULL_REQUEST_REPO="${BUILDKITE_PULL_REQUEST_REPO}" \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
RUN pip config set global.index-url http://pypi-cache:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host pypi-cache && \
apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install for pytest to make the docker build cache layer always valid
RUN --mount=type=cache,target=/root/.cache/pip \
pip install pytest>=6.0 pytest-cov modelscope
COPY . .
# Install vllm-omni
WORKDIR /workspace
ARG VLLM_OMNI_REPO=https://github.com/vllm-project/vllm-omni.git
ARG VLLM_OMNI_TAG=main
ARG BUILDKITE_PULL_REQUEST
ARG BUILDKITE_PULL_REQUEST_REPO
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
if [ "\$BUILDKITE_PULL_REQUEST" != "false" ] && [ -n "\$BUILDKITE_PULL_REQUEST" ]; then \
echo "Cloning and checking out PR #\$BUILDKITE_PULL_REQUEST..." && \
git clone \$VLLM_OMNI_REPO /workspace/vllm-omni && \
cd /workspace/vllm-omni && \
git fetch origin pull/\$BUILDKITE_PULL_REQUEST/head:pr-\$BUILDKITE_PULL_REQUEST && \
git checkout pr-\$BUILDKITE_PULL_REQUEST; \
else \
echo "Not a PR build, using main branch" && \
git clone --depth 1 \$VLLM_OMNI_REPO /workspace/vllm-omni; \
fi
RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-omni/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
WORKDIR /workspace/vllm-omni
CMD ["/bin/bash"]
EOF
# Setup cleanup
remove_docker_container() {
docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
local input="$1"
local index cards_num
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
index="${BASH_REMATCH[1]}"
cards_num="${BASH_REMATCH[2]}"
else
echo "parse error" >&2
return 1
fi
local devices=""
local i=0
while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i ))
devices="$devices --device /dev/davinci${dev_idx}"
((i++))
done
# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output the device args so the caller can capture them
printf '%s' "$devices"
}
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
hf_model_cache_dir=/mnt/hf_cache${agent_idx}
ms_model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${hf_model_cache_dir}
mkdir -p ${ms_model_cache_dir}
docker run \
--init \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v ${hf_model_cache_dir}:/root/.cache/huggingface \
-v ${ms_model_cache_dir}:/root/.cache/modelscope \
--network host \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \
bash -c '
set -e
VLLM_USE_MODELSCOPE=True pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
'
steps:
- label: "Diffusion Model Test"
timeout_in_minutes: 20
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
- label: "Diffusion Images API LoRA E2E"
timeout_in_minutes: 20
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
- label: "Diffusion Model CPU offloading Test"
timeout_in_minutes: 20
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
- label: "Diffusion Cache Backend Test"
timeout_in_minutes: 15
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
- label: "Diffusion Sequence Parallelism Test"
timeout_in_minutes: 20
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
- label: "Diffusion Tensor Parallelism Test"
timeout_in_minutes: 20
agent_pool: mi325_2
depends_on: amd-build
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
- label: "Diffusion GPU Worker Test"
timeout_in_minutes: 20
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- pytest -s -v tests/diffusion/test_diffusion_worker.py
- label: "Omni Model Test Qwen2-5-Omni"
timeout_in_minutes: 15
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
- label: "Omni Model Test Qwen3-Omni"
timeout_in_minutes: 15
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- export VLLM_TEST_CLEAN_GPU_MEMORY="1"
- pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
- pytest -s -v tests/e2e/online_serving/test_async_omni.py
- label: "Diffusion Image Edit Test"
timeout_in_minutes: 15
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
{# vllm-omni customized version
Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/test-template-amd.j2
Last synced: 2025-12-15
Modifications: Removed unused CUDA/NVIDIA logic, keeping only AMD tests
#}
{% set docker_image_amd = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-rocm-omni" %}
{% set default_working_dir = "/app/vllm-omni" %}
- group: "AMD Tests"
depends_on: ~
steps:
- label: "AMD: :docker: build image"
depends_on: ~
soft_fail: false
commands:
- "source .buildkite/scripts/docker_login_ecr_public.sh && safe_docker_login_ecr_public"
- "docker build -f docker/Dockerfile.rocm -t {{ docker_image_amd }} --target final --progress plain ."
- "docker push {{ docker_image_amd }}"
key: "amd-build"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 1 # Machine occasionally fail
limit: 1
agents:
queue: cpu_queue_premerge
{% for step in steps %}
{% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %}
- label: "{{ step.agent_pool }}: {{ step.label }}"
depends_on: amd-build
agents:
{% if step.agent_pool %}
queue: amd_{{ step.agent_pool }}
{% else %}
queue: amd_mi325_1
{% endif %}
command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
priority: 100
{% if step.grade and step.grade == "Blocking" %}
soft_fail: false
{% else %}
soft_fail: true
{% endif %}
{% endif %}
{% endfor %}
default_install_hook_types:
- pre-commit
- commit-msg
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
hooks:
# list of supported hooks: https://pre-commit.com/hooks.html
- id: check-yaml
args: ["--unsafe"]
- id: debug-statements
- id: end-of-file-fixer
- id: mixed-line-ending
args: ["--fix=lf"]
- id: trailing-whitespace
args: ["--markdown-linebreak-ext=md"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.10
hooks:
- id: ruff-check
args: [--output-format, github, --fix]
- id: ruff-format
- repo: https://github.com/crate-ci/typos
rev: typos-dict-v0.13.13
hooks:
- id: typos
# only for staged files
- repo: https://github.com/rhysd/actionlint
# v1.7.8+ sets `go 1.24.0` in go.mod, which older Go toolchains (and most
# current CI images) cannot parse. Pin to v1.7.7 until actionlint fixes the
# go.mod directive.
rev: v1.7.7
hooks:
- id: actionlint
files: ^\.github/workflows/.*\.ya?ml$
- repo: local
hooks:
- id: signoff-commit
name: Sign-off Commit
entry: bash
args:
- -c
- |
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
fi
language: system
verbose: true
stages: [commit-msg]
# Keep `suggestion` last
- id: suggestion
name: Suggestion
entry: bash -c 'echo "To bypass all the pre-commit hooks, add --no-verify to git commit. To skip a specific hook, prefix the commit command with SKIP=<hook-id>."'
language: system
verbose: true
pass_filenames: false
# Insert new entries above the `suggestion` entry
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/pre_commit/check_pickle_imports.py
language: python
types: [python]
additional_dependencies: [regex]
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.12"
jobs:
post_checkout:
- git fetch --unshallow || true
mkdocs:
configuration: mkdocs.yml
fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:
install:
- method: pip
path: .
extra_requirements:
- docs
# Contributing to vLLM-Omni
You can find information about contributing to vLLM-Omni in the [Contributing guide](https://vllm-omni.readthedocs.io/en/latest/contributing/).
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# vllm-omni
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/docs/source/logos/vllm-omni-logo.png">
<img alt="vllm-omni" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/docs/source/logos/vllm-omni-logo.png" width=55%>
</picture>
</p>
<h3 align="center">
Easy, fast, and cheap omni-modality model serving for everyone
</h3>
vLLM was originally designed to support large language models for text generation tasks. vLLM-Omni is a framework that extends vLLM's support to omni-modality model inference and serving.
<p align="center">
| <a href="https://vllm-omni.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> | <a href="docs/assets/WeChat.jpg"><b>WeChat</b></a> |
</p>
---
*Latest News* 🔥
- [2026/02] We released [0.14.0](https://github.com/vllm-project/vllm-omni/releases/tag/v0.14.0) - This is the first **stable release** of vLLM-Omni that expands Omni’s diffusion / image-video generation and audio / TTS stack, improves distributed execution and memory efficiency, and broadens platform/backend coverage (GPU/ROCm/NPU/XPU). It also brings meaningful upgrades to serving APIs, profiling & benchmarking, and overall stability. Please check our latest [paper](https://arxiv.org/abs/2602.02204) for architecture design and performance results.
- [2026/01] We released [0.12.0rc1](https://github.com/vllm-project/vllm-omni/releases/tag/v0.12.0rc1) - a major RC milestone focused on maturing the diffusion stack, strengthening OpenAI-compatible serving, expanding omni-model coverage, and improving stability across platforms (GPU/NPU/ROCm), please check our latest [design](https://docs.google.com/presentation/d/1qv4qMW1rKAqDREMXiUDLIgqqHQe7TDPj/edit?usp=sharing&ouid=110473603432222024453&rtpof=true&sd=true).
- [2025/11] The vLLM community officially released [vllm-project/vllm-omni](https://github.com/vllm-project/vllm-omni) to support omni-modality model serving.
---
## About
[vLLM](https://github.com/vllm-project/vllm) was originally designed to support large language models for text-based autoregressive generation tasks. vLLM-Omni is a framework that extends its support for omni-modality model inference and serving:
- **Omni-modality**: Text, image, video, and audio data processing
- **Non-autoregressive Architectures**: extend the AR support of vLLM to Diffusion Transformers (DiT) and other parallel generation models
- **Heterogeneous outputs**: from traditional text generation to multimodal outputs
<p align="center">
<picture>
<img alt="vllm-omni" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/docs/source/architecture/omni-modality-model-architecture.png" width=55%>
</picture>
</p>
vLLM-Omni is fast with:
- State-of-the-art AR support by leveraging efficient KV cache management from vLLM
- Pipelined stage execution overlapping for high throughput performance
- Full disaggregation based on OmniConnector and dynamic resource allocation across stages
vLLM-Omni is flexible and easy to use with:
- Heterogeneous pipeline abstraction to manage complex model workflows
- Seamless integration with popular Hugging Face models
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
vLLM-Omni seamlessly supports most popular open-source models on HuggingFace, including:
- Omni-modality models (e.g. Qwen-Omni)
- Multi-modality generation models (e.g. Qwen-Image)
## Getting Started
Visit our [documentation](https://vllm-omni.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm-omni.readthedocs.io/en/latest/getting_started/installation/)
- [Quickstart](https://vllm-omni.readthedocs.io/en/latest/getting_started/quickstart/)
- [List of Supported Models](https://vllm-omni.readthedocs.io/en/latest/models/supported_models/)
## Contributing
We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM-Omni](https://vllm-omni.readthedocs.io/en/latest/contributing/) for how to get involved.
## Citation
If you use vLLM-Omni for your research, please cite our [paper](https://arxiv.org/abs/2602.02204):
```bibtex
@article{yin2026vllmomni,
title={vLLM-Omni: Fully Disaggregated Serving for Any-to-Any Multimodal Models},
author={Peiqi Yin and Jiangyun Zhu and Han Gao and Chenguang Zheng and Yongxiang Huang and Taichang Zhou and Ruirui Yang and Weizhi Liu and Weiqing Chen and Canlin Guo and Didan Deng and Zifeng Mo and Cong Wang and James Cheng and Roger Wang and Hongsheng Liu},
journal={arXiv preprint arXiv:2602.02204},
year={2026}
}
```
## Join the Community
Feel free to ask questions, provide feedback, and discuss with fellow users of vLLM-Omni in the `#sig-omni` Slack channel at [slack.vllm.ai](https://slack.vllm.ai) or the vLLM user forum at [discuss.vllm.ai](https://discuss.vllm.ai).
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=vllm-project/vllm-omni&type=date&legend=top-left)](https://www.star-history.com/#vllm-project/vllm-omni&type=date&legend=top-left)
## License
Apache License 2.0, as found in the [LICENSE](./LICENSE) file.
# Benchmarks Overview and Architecture
This document explains the benchmark architecture across all benchmark assets in this repo. It describes what we measure, and where to find or plug in new scenarios. Per-task details remain in subfolder READMEs (e.g., `benchmarks/<model>/README.md`).
## Scope and goals
- Establish repeatable latency/throughput measurements for multimodal LLM pipelines.
- Provide both HF Transformers (offline) and vLLM-Omni (multi-stage/pipeline) baselines.
- Make it easy to plug in new datasets and models with minimal changes to the runner scripts.
## Dataset and inputs
- Default example: SeedTTS top-100 prompts (`benchmarks/build_dataset/top100.txt`) via `benchmarks/build_dataset/`.
- Extensible: drop in new prompt files or modality-aligned payloads; keep the expected format for the consuming scripts (e.g., one prompt per line).
- If you add a new dataset, document it under `benchmarks/<model>/README.md` and point scripts to your data path.
## Directory layout
- `benchmarks/build_dataset/` — dataset prep utilities (e.g., SeedTTS top100).
- `benchmarks/<model>/vllm_omni/` — vLLM-Omni pipeline benchmarks, logs, outputs.
- Add new tasks under `benchmarks/<model>/...` with the same pattern: `transformers/`, `vllm_omni/`, task-specific README, and (optionally) dataset prep notes.
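For example, a new task might start from a skeleton like the one below (the names are illustrative only; the repo does not mandate these exact paths):
```bash
# Hypothetical skeleton for a new benchmark task; adjust names to your model.
mkdir -p benchmarks/my_model/transformers benchmarks/my_model/vllm_omni
touch benchmarks/my_model/README.md                  # dataset, configs, expected outputs
touch benchmarks/my_model/transformers/run_hf.sh     # HF Transformers offline baseline
touch benchmarks/my_model/vllm_omni/run_pipeline.sh  # vLLM-Omni pipeline benchmark
```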
## Reference workflows
- **HF Transformers (offline, single process)**
Script (example): `benchmarks/<model>/transformers/eval_qwen3_moe_omni_transformers.sh`
Outputs: `benchmark_results/perf_stats.json`, `benchmark_results/results.json`, `benchmark_results/audio/` (if audio is produced).
- **vLLM-Omni end-to-end pipeline**
Script (example): `benchmarks/<model>/vllm_omni/eval_qwen3_moe_omni.sh`
Outputs: `vllm_omni/logs/*.stats.jsonl` (per-stage/overall latency & TPS), `vllm_omni/logs/stage*.log`, `vllm_omni/outputs/` (text/audio artifacts).
- **Adding a new task/model**
1) Create `benchmarks/<model>/transformers/` and/or `benchmarks/<model>/vllm_omni/` with scripts referencing your model and dataset.
2) Add a task README describing dataset, configs, and expected outputs.
3) Keep the output/log structure similar for easy comparison (perf_stats/results/audio or text outputs; stats.jsonl/logs for pipeline).
## Metrics to watch
- **Throughput**: `overall_tps`, `*_tps_avg` per stage.
- **Latency distribution**: look for long tails in `*.stats.jsonl`.
- **Quality/completeness**: missing outputs or errors in stage logs indicate pipeline failures or misconfigurations.
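As a quick sanity check, you can pull throughput fields out of the stats files with `jq`. This is only a sketch: the exact key and file names depend on what your task actually emits, so adjust them to match your `*.stats.jsonl`.
```bash
# Print every overall_tps value found in the pipeline stats files and show the
# highest one. Assumes records carry an "overall_tps" field as described above.
jq -r 'select(.overall_tps != null) | .overall_tps' vllm_omni/logs/*.stats.jsonl \
  | sort -n | tail -n 1
```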
## Troubleshooting
- Verify GPU/driver/FlashAttention2 requirements for your chosen model/config.
- Ensure network access for dataset/model downloads (Google Drive, Hugging Face, etc.).
- If outputs are missing or slow, inspect per-stage logs and `*.stats.jsonl` for errors, stragglers, or contention.
# Benchmark Dataset Preparation Guide
This guide describes how to download and prepare the SeedTTS test dataset for benchmarking Qwen-Omni models.
## Prerequisites
- Python 3.8+
- `gdown` for downloading from Google Drive
- Access to the benchmark scripts
## Steps
### 1. Navigate to the Dataset Directory
```bash
cd benchmarks/build_dataset
```
### 2. Install Dependencies
```bash
pip install gdown
```
### 3. Download the SeedTTS Test Dataset
Download the dataset from Google Drive:
```bash
gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP
```
### 4. Extract the Dataset
```bash
tar -xf seedtts_testset.tar
```
### 5. Prepare the Metadata File
Copy the English metadata file to the working directory:
```bash
cp seedtts_testset/en/meta.lst meta.lst
```
### 6. Extract Prompts
Extract the first N prompts from the metadata file:
```bash
# Extract top 100 prompts (adjust -n for different amounts)
python extract_tts_prompts.py -i meta.lst -o top100.txt -n 100
```
**Options:**
- `-i, --input`: Input metadata file (default: `meta.lst`)
- `-o, --output`: Output prompts file (default: `prompts.txt`)
- `-n, --num_lines`: Number of prompts to extract (required)
### 7. Clean Up (Optional)
Remove temporary files to save disk space:
```bash
rm -rf seedtts_testset
rm seedtts_testset.tar
rm meta.lst
```
## Quick Start (All-in-One)
```bash
# Full setup and benchmark
cd benchmarks/build_dataset
pip install gdown
gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP
tar -xf seedtts_testset.tar
cp seedtts_testset/en/meta.lst meta.lst
python extract_tts_prompts.py -i meta.lst -o top100.txt -n 100
rm -rf seedtts_testset seedtts_testset.tar meta.lst
```
#!/usr/bin/env python3
"""
Extract prompts from meta.lst and save them to a txt file.
Each line in meta.lst has the format:
ID|prompt_text|audio_path|target_text
This script extracts the prompt_text (second field) from the first N lines.
"""
import argparse
from pathlib import Path
def extract_prompts(input_file: str, output_file: str, num_lines: int) -> None:
"""
Extract prompts from meta.lst and save to output file.
Args:
input_file: Path to the meta.lst file
output_file: Path to the output txt file
num_lines: Number of lines to process
"""
prompts = []
with open(input_file, encoding="utf-8") as f:
for i, line in enumerate(f):
if i >= num_lines:
break
line = line.strip()
if not line: # Skip empty lines
continue
parts = line.split("|")
if len(parts) >= 2:
prompt = parts[1] # The prompt is the second field
prompts.append(prompt)
# Write prompts to output file
with open(output_file, "w", encoding="utf-8") as f:
for prompt in prompts:
f.write(prompt + "\n")
# Print result stats
print(f"Extracted {len(prompts)} prompts from first {num_lines} lines")
print(f"Saved to: {output_file}")
def main():
parser = argparse.ArgumentParser(description="Extract prompts from meta.lst file")
parser.add_argument(
"-i", "--input", type=str, default="meta.lst", help="Input meta.lst file path (default: meta.lst)"
)
parser.add_argument(
"-o", "--output", type=str, default="prompts.txt", help="Output txt file path (default: prompts.txt)"
)
parser.add_argument(
"-n", "--num_lines", type=int, required=True, help="Number of lines to extract from the beginning"
)
args = parser.parse_args()
# Check if input file exists
if not Path(args.input).exists():
print(f"Error: Input file '{args.input}' not found")
return
extract_prompts(args.input, args.output, args.num_lines)
if __name__ == "__main__":
main()
# Diffusion Serving Benchmark (Image/Video)
This folder contains an online-serving benchmark script for diffusion models.
It sends requests to a vLLM OpenAI-compatible endpoint and reports throughput,
latency percentiles, and optional SLO attainment.
The main entrypoint is:
- `benchmarks/diffusion/diffusion_benchmark_serving.py`
## 1. Quick Start
1. Start the server:
```bash
vllm serve Qwen/Qwen-Image --omni --port 8099
```
2. Run a minimal benchmark:
```bash
python3 benchmarks/diffusion/diffusion_benchmark_serving.py \
--base-url http://localhost:8099 \
--model Qwen/Qwen-Image \
--task t2i \
--dataset vbench \
--num-prompts 5
```
**Notes**
- The benchmark talks to `http://<host>:<port>/v1/chat/completions`.
- If you run the server on another host or port, pass `--base-url` accordingly.
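For reference, the sketch below mirrors the shape of a single request the benchmark script constructs (the prompt goes into the chat `messages`, and generation parameters such as width/height/steps travel in `extra_body`); the model name, URL, and parameter values here are illustrative only.
```python
import requests

# Illustrative single request mirroring the benchmark's payload construction;
# width/height/steps values are arbitrary examples, not defaults.
payload = {
    "model": "Qwen/Qwen-Image",
    "messages": [{"role": "user", "content": "A cat sitting on a bench"}],
    "extra_body": {"width": 1024, "height": 1024, "num_inference_steps": 50},
}
resp = requests.post("http://localhost:8099/v1/chat/completions", json=payload)
print(resp.status_code)
```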
## 2. Supported Datasets
The benchmark supports three dataset modes via `--dataset`:
- `vbench`: Built-in prompt/data loader.
- `trace`: Heterogeneous request traces (each request can have different resolution/frames/steps).
- `random`: Synthetic prompts for quick smoke tests.
### VBench dataset
If you use the i2v/i2i VBench datasets and rely on auto-download, you may need to install `gdown` first:
```bash
uv pip install gdown
```
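Prompts and any auto-downloaded VBench assets are cached under `~/.cache/vllm-omni` (see the dataset loaders in the benchmark script).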
### Trace dataset
Use `--dataset trace` to replay a trace file. The trace can specify per-request fields such as:
- `width`, `height`
- `num_frames` (video)
- `num_inference_steps`
- `seed`, `fps`
- optional `slo_ms` (per-request SLO target)
By default (when `--dataset-path` is not provided), the script downloads a default trace from
the HuggingFace dataset repo `asukaqaqzz/Dit_Trace`. The default filename can depend on `--task`
(e.g., `t2v` uses a video trace).
Current defaults:
- `--task t2i` -> `sd3_trace.txt`
- `--task t2v` -> `cogvideox_trace.txt`
You can point to your own trace using `--dataset-path`.
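Trace files are plain text; each non-empty line is a `Request(...)` entry whose keyword arguments are Python literals (see `_parse_trace_file` in the script). An illustrative line (values are made up):
```
Request(prompt='a red sports car on a coastal road', width=1024, height=1024, num_inference_steps=28, seed=0, slo_ms=20000.0)
```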
## 3. Benchmark Parameters
### Basic flags
- `--base-url`: Server address (the script calls `.../v1/chat/completions`).
- `--model`: The OpenAI-compatible `model` field.
- `--task`: Task type (e.g., `t2i`, `t2v`, `i2i`, `i2v`).
- `--dataset`: Dataset mode (`vbench` / `trace` / `random`).
- `--num-prompts`: Number of requests to send.
Common optional flags:
- `--output-file`: Write metrics to a JSON file.
- `--disable-tqdm`: Disable the progress bar.
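With `--output-file metrics.json`, the final metrics dictionary (duration, completed/failed request counts, `throughput_qps`, latency mean/median/p50/p99, peak-memory stats, and the SLO fields when `--slo` is enabled) is written as JSON.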
### Resolution / frames / steps: CLI defaults vs dataset fields
Related flags: `--width`, `--height`, `--num-frames`, `--fps`, `--num-inference-steps`.
- For `vbench` / `random`: these CLI flags act as global defaults for all generated requests.
- For `trace`: each request can carry its own fields (e.g., `width/height/num_frames/num_inference_steps`).
Precedence rules for `trace` (i.e., what actually gets sent):
- `width/height`: if either `--width` or `--height` is explicitly set, it overrides per-request values from the trace; otherwise per-request values are used when present.
- `num_frames`: per-request `num_frames` takes precedence; otherwise fall back to `--num-frames`.
- `num_inference_steps`: per-request `num_inference_steps` takes precedence; otherwise fall back to `--num-inference-steps`.
### SLO, warmup, and max concurrency
Enable SLO evaluation with `--slo`.
- If a request in the trace already has `slo_ms`, that value is used.
- Otherwise, the script runs warmup requests to infer a base unit time, estimates `expected_ms` by linearly scaling with area/frames/steps, and then sets `slo_ms = expected_ms * --slo-scale`.
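As a rough illustration (numbers are made up): if a 512x512, single-frame, single-step warmup takes 1.0 s, the inferred base unit is 1000 ms / (512*512 / 16*16) ≈ 0.98 ms; a 512x512, 20-step request is then expected to take about 0.98 ms * 1024 * 20 ≈ 20 s, so with the default `--slo-scale 3` its `slo_ms` is set to roughly 60,000 ms.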
Warmup flags:
- `--warmup-requests`: Number of warmup requests.
- `--warmup-num-inference-steps`: Steps used during warmup.
- For `--task t2v`: warmup requests are forced to use `num_frames=1` to make warmup faster and less noisy.
Traffic / concurrency flags:
- `--request-rate`: Target request rate (requests/second). If set to `inf`, the script sends all requests immediately.
- `--max-concurrency`: Max number of in-flight requests (default: `1`). This can hard-cap the achieved QPS: if it is too small, requests will queue behind the semaphore, and both achieved throughput and observed SLO attainment can be skewed.
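For example, `--request-rate 0.5 --max-concurrency 4` issues a new request every 2 seconds while allowing at most four requests in flight; with the default `--max-concurrency 1`, requests are effectively serialized regardless of `--request-rate`.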
# adapted from sglang and fastvideo
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark online serving for diffusion models (Image/Video Generation).
If you want to use the i2v or i2i datasets, run `uv pip install gdown` first.
Usage:
# Video
t2v:
python3 benchmarks/diffusion/diffusion_benchmark_serving.py \
--dataset vbench --task t2v --num-prompts 10 \
--height 480 --width 640 --fps 16 --num-frames 80
i2v:
python3 benchmarks/diffusion/diffusion_benchmark_serving.py \
--dataset vbench --task i2v --num-prompts 10
# Image
t2i:
python3 benchmarks/diffusion/diffusion_benchmark_serving.py \
--dataset vbench --task t2i --num-prompts 10 \
--height 1024 --width 1024
i2i:
python3 benchmarks/diffusion/diffusion_benchmark_serving.py \
--dataset vbench --task i2i --num-prompts 10
"""
import argparse
import ast
import asyncio
import base64
import glob
import json
import mimetypes
import os
import time
import uuid
from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator
from dataclasses import dataclass, field, replace
from typing import Any
import aiohttp
import numpy as np
import requests
from tqdm.asyncio import tqdm
@dataclass
class RequestFuncInput:
prompt: str
api_url: str
model: str
width: int | None = None
height: int | None = None
num_frames: int | None = None
num_inference_steps: int | None = None
seed: int | None = None
fps: int | None = None
timestamp: float | None = None
slo_ms: float | None = None
extra_body: dict[str, Any] = field(default_factory=dict)
image_paths: list[str] | None = None
request_id: str = field(default_factory=lambda: str(uuid.uuid4()))
@dataclass
class RequestFuncOutput:
success: bool = False
latency: float = 0.0
error: str = ""
start_time: float = 0.0
response_body: dict[str, Any] = field(default_factory=dict)
peak_memory_mb: float = 0.0
slo_achieved: bool | None = None
class BaseDataset(ABC):
def __init__(self, args, api_url: str, model: str):
self.args = args
self.api_url = api_url
self.model = model
@abstractmethod
def __len__(self) -> int:
pass
@abstractmethod
def __getitem__(self, idx: int) -> RequestFuncInput:
pass
@abstractmethod
def get_requests(self) -> list[RequestFuncInput]:
pass
class VBenchDataset(BaseDataset):
"""
Dataset loader for VBench prompts.
Supports t2v, i2v.
"""
T2V_PROMPT_URL = (
"https://raw.githubusercontent.com/Vchitect/VBench/master/prompts/prompts_per_dimension/subject_consistency.txt"
)
I2V_DOWNLOAD_SCRIPT_URL = (
"https://raw.githubusercontent.com/Vchitect/VBench/master/vbench2_beta_i2v/download_data.sh"
)
def __init__(self, args, api_url: str, model: str):
super().__init__(args, api_url, model)
self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "vllm-omni")
self.items = self._load_data()
def _load_data(self) -> list[dict[str, Any]]:
if self.args.task == "t2v":
return self._load_t2v_prompts()
elif self.args.task in ["i2v", "ti2v", "ti2i", "i2i"]:
return self._load_i2v_data()
else:
return self._load_t2v_prompts()
def _download_file(self, url: str, dest_path: str) -> None:
"""Download a file from URL to destination path."""
os.makedirs(os.path.dirname(dest_path), exist_ok=True)
resp = requests.get(url)
resp.raise_for_status()
with open(dest_path, "w") as f:
f.write(resp.text)
def _load_t2v_prompts(self) -> list[dict[str, Any]]:
path = self.args.dataset_path
if not path:
path = os.path.join(self.cache_dir, "vbench_subject_consistency.txt")
if not os.path.exists(path):
print(f"Downloading VBench T2V prompts to {path}...")
try:
self._download_file(self.T2V_PROMPT_URL, path)
except Exception as e:
print(f"Failed to download VBench prompts: {e}")
return [{"prompt": "A cat sitting on a bench"}] * 50
prompts = []
with open(path) as f:
for line in f:
line = line.strip()
if line:
prompts.append({"prompt": line})
return self._resize_data(prompts)
def _auto_download_i2v_dataset(self) -> str:
"""Auto-download VBench I2V dataset and return the dataset directory."""
vbench_i2v_dir = os.path.join(self.cache_dir, "vbench_i2v", "vbench2_beta_i2v")
info_json_path = os.path.join(vbench_i2v_dir, "data", "i2v-bench-info.json")
if os.path.exists(info_json_path):
return vbench_i2v_dir
print(f"Downloading VBench I2V dataset to {vbench_i2v_dir}...")
try:
cache_root = os.path.join(self.cache_dir, "vbench_i2v")
script_path = os.path.join(cache_root, "download_data.sh")
self._download_file(self.I2V_DOWNLOAD_SCRIPT_URL, script_path)
os.chmod(script_path, 0o755)
print("Executing download_data.sh (this may take a while)...")
import subprocess
result = subprocess.run(
["bash", script_path],
cwd=cache_root,
capture_output=True,
text=True,
)
if result.returncode != 0:
raise RuntimeError(f"Download script failed: {result.stderr}")
print(f"Successfully downloaded VBench I2V dataset to {vbench_i2v_dir}")
except Exception as e:
print(f"Failed to download VBench I2V dataset: {e}")
print("Please manually download following instructions at:")
print("https://github.com/Vchitect/VBench/tree/master/vbench2_beta_i2v#22-download")
return None
return vbench_i2v_dir if os.path.exists(info_json_path) else None
def _load_from_i2v_json(self, json_path: str) -> list[dict[str, Any]]:
"""Load I2V data from i2v-bench-info.json format."""
with open(json_path) as f:
items = json.load(f)
base_dir = os.path.dirname(os.path.dirname(json_path)) # Go up to vbench2_beta_i2v
origin_dir = os.path.join(base_dir, "data", "origin")
data = []
for item in items:
img_path = os.path.join(origin_dir, item.get("file_name", ""))
if os.path.exists(img_path):
data.append({"prompt": item.get("caption", ""), "image_path": img_path})
else:
print(f"Warning: Image not found: {img_path}")
print(f"Loaded {len(data)} I2V samples from VBench I2V dataset")
return data
def _scan_directory_for_images(self, path: str) -> list[dict[str, Any]]:
"""Scan directory for image files."""
exts = ["*.jpg", "*.jpeg", "*.png", "*.webp"]
files = []
for ext in exts:
files.extend(glob.glob(os.path.join(path, ext)))
files.extend(glob.glob(os.path.join(path, ext.upper())))
# Also check in data/origin subdirectory
origin_dir = os.path.join(path, "data", "origin")
if os.path.exists(origin_dir):
files.extend(glob.glob(os.path.join(origin_dir, ext)))
files.extend(glob.glob(os.path.join(origin_dir, ext.upper())))
return [{"prompt": os.path.splitext(os.path.basename(f))[0], "image_path": f} for f in files]
def _create_dummy_data(self) -> list[dict[str, Any]]:
"""Create dummy data with a placeholder image in cache directory."""
print("No I2V data found. Using dummy placeholders.")
dummy_image = os.path.join(self.cache_dir, "dummy_image.jpg")
if not os.path.exists(dummy_image):
try:
from PIL import Image
os.makedirs(self.cache_dir, exist_ok=True)
img = Image.new("RGB", (100, 100), color="red")
img.save(dummy_image)
print(f"Created dummy image at {dummy_image}")
except ImportError:
print("PIL not installed, cannot create dummy image.")
return []
return [{"prompt": "A moving cat", "image_path": dummy_image}] * 10
def _load_i2v_data(self) -> list[dict[str, Any]]:
"""Load I2V data from VBench I2V dataset or user-provided path."""
path = self.args.dataset_path
# Auto-download if no path provided
if not path:
path = self._auto_download_i2v_dataset()
if not path:
return self._resize_data(self._create_dummy_data())
# Try to load from i2v-bench-info.json
info_json_candidates = [
os.path.join(path, "data", "i2v-bench-info.json"),
path if path.endswith(".json") else None,
]
for json_path in info_json_candidates:
if json_path and os.path.exists(json_path):
try:
return self._resize_data(self._load_from_i2v_json(json_path))
except Exception as e:
print(f"Failed to load {json_path}: {e}")
# Fallback: scan directory for images
if os.path.isdir(path):
data = self._scan_directory_for_images(path)
if data:
return self._resize_data(data)
# Last resort: dummy data
return self._resize_data(self._create_dummy_data())
def _resize_data(self, data: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""Resize data to match num_prompts."""
if not self.args.num_prompts:
return data
if len(data) < self.args.num_prompts:
factor = (self.args.num_prompts // len(data)) + 1
data = data * factor
return data[: self.args.num_prompts]
def __len__(self) -> int:
return len(self.items)
def __getitem__(self, idx: int) -> RequestFuncInput:
item = self.items[idx]
image_paths = [item["image_path"]] if "image_path" in item else None
return RequestFuncInput(
prompt=item.get("prompt", ""),
api_url=self.api_url,
model=self.model,
width=self.args.width,
height=self.args.height,
num_frames=self.args.num_frames,
num_inference_steps=self.args.num_inference_steps,
seed=self.args.seed,
fps=self.args.fps,
image_paths=image_paths,
)
def get_requests(self) -> list[RequestFuncInput]:
return [self[i] for i in range(len(self))]
class TraceDataset(BaseDataset):
"""Trace-based dataset loader for heterogeneous diffusion requests."""
DEFAULT_REPO_ID = "asukaqaqzz/Dit_Trace"
DEFAULT_FILENAME = "sd3_trace.txt"
DEFAULT_FILENAME_BY_TASK: dict[str, str] = {
# Text-to-image traces (e.g., SD3)
"t2i": "sd3_trace.txt",
# Text-to-video traces (e.g., CogVideoX)
"t2v": "cogvideox_trace.txt",
}
def __init__(self, args, api_url: str, model: str):
super().__init__(args, api_url, model)
self.cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "vllm-omni", "trace")
self.default_filename = self.DEFAULT_FILENAME_BY_TASK.get(getattr(args, "task", ""), self.DEFAULT_FILENAME)
dataset_root = args.dataset_path
if not dataset_root:
dataset_root = self._download_default_trace()
self.items = self._load_items(dataset_root)
@staticmethod
def _coerce_int(x: Any) -> int | None:
if x is None:
return None
if isinstance(x, bool):
return None
if isinstance(x, int):
return x
try:
s = str(x).strip()
if not s:
return None
return int(float(s))
except Exception:
return None
@staticmethod
def _coerce_float(x: Any) -> float | None:
if x is None:
return None
if isinstance(x, float):
return x
if isinstance(x, int):
return float(x)
try:
s = str(x).strip()
if not s:
return None
return float(s)
except Exception:
return None
def _download_default_trace(self) -> str:
"""Download default trace file from HuggingFace Hub if not provided."""
try:
from huggingface_hub import hf_hub_download
except ImportError as exc:
raise ImportError(
"huggingface_hub is required to download the default trace dataset. "
"Install via `pip install huggingface_hub`."
) from exc
os.makedirs(self.cache_dir, exist_ok=True)
return hf_hub_download(
repo_id=self.DEFAULT_REPO_ID,
filename=self.default_filename,
repo_type="dataset",
local_dir=self.cache_dir,
local_dir_use_symlinks=False,
)
def _expand_paths(self, dataset_path: str | None) -> list[str]:
if not dataset_path:
return []
parts = [p.strip() for p in str(dataset_path).split(",") if p.strip()]
paths: list[str] = []
for p in parts:
if any(ch in p for ch in ["*", "?", "["]):
paths.extend(sorted(glob.glob(p)))
elif os.path.isdir(p):
paths.extend(sorted(glob.glob(os.path.join(p, "**", "*.txt"), recursive=True)))
else:
paths.append(p)
seen = set()
unique_paths = []
for p in paths:
if p not in seen:
seen.add(p)
unique_paths.append(p)
return unique_paths
def _parse_trace_file(self, path: str) -> list[dict[str, Any]]:
rows: list[dict[str, Any]] = []
def parse_request_repr_line(line: str) -> dict[str, Any] | None:
text = line.strip()
if not text:
return None
if not (text.startswith("Request(") and text.endswith(")")):
return None
inner = text[len("Request(") : -1]
try:
expr = ast.parse(f"f({inner})", mode="eval")
if not isinstance(expr.body, ast.Call):
return None
call = expr.body
out: dict[str, Any] = {}
for kw in call.keywords:
if kw.arg is None:
continue
out[kw.arg] = ast.literal_eval(kw.value)
return out
except Exception:
return None
# detect first non-empty line to pick parser
first_non_empty = None
with open(path, encoding="utf-8") as f:
for _ in range(50):
pos = f.tell()
line = f.readline()
if not line:
break
if line.strip():
first_non_empty = line.strip()
f.seek(pos)
break
if first_non_empty is None:
return rows
if first_non_empty.startswith("Request("):
with open(path, encoding="utf-8") as f:
for line in f:
parsed = parse_request_repr_line(line)
if isinstance(parsed, dict):
rows.append(parsed)
return rows
# txt fallback: parse Request(...) lines only
with open(path, encoding="utf-8") as f:
for line in f:
parsed = parse_request_repr_line(line)
if isinstance(parsed, dict):
rows.append(parsed)
return rows
def _load_items(self, dataset_root: str) -> list[dict[str, Any]]:
paths = self._expand_paths(dataset_root)
if not paths:
raise ValueError("No trace files found. Provide --dataset-path or rely on default HuggingFace download.")
items: list[dict[str, Any]] = []
for p in paths:
if not os.path.exists(p):
continue
for row in self._parse_trace_file(p):
if isinstance(row, dict):
row = dict(row)
row.setdefault("_source", p)
items.append(row)
if not items:
raise ValueError("Trace dataset is empty after parsing provided paths.")
if self.args.num_prompts is not None:
items = items[: self.args.num_prompts]
return items
def __len__(self) -> int:
return len(self.items)
def __getitem__(self, idx: int) -> RequestFuncInput:
row = self.items[idx]
prompt = row.get("prompt") or row.get("text") or ""
row_height = self._coerce_int(row.get("height"))
row_width = self._coerce_int(row.get("width"))
num_frames = self._coerce_int(row.get("num_frames"))
num_steps = self._coerce_int(row.get("num_inference_steps"))
seed = self._coerce_int(row.get("seed"))
fps = self._coerce_int(row.get("fps"))
timestamp = self._coerce_float(row.get("timestamp"))
slo_ms = self._coerce_float(row.get("slo_ms"))
image_paths = row.get("image_paths")
override_w = self.args.width
override_h = self.args.height
if override_w is not None or override_h is not None:
width = override_w
height = override_h
else:
width = row_width
height = row_height
return RequestFuncInput(
prompt=str(prompt),
api_url=self.api_url,
model=self.model,
width=width,
height=height,
num_frames=num_frames if num_frames is not None else self.args.num_frames,
num_inference_steps=num_steps if num_steps is not None else self.args.num_inference_steps,
seed=seed if seed is not None else self.args.seed,
fps=fps if fps is not None else self.args.fps,
timestamp=timestamp,
slo_ms=slo_ms,
image_paths=image_paths,
request_id=str(row.get("request_id")) if row.get("request_id") is not None else str(uuid.uuid4()),
)
def get_requests(self) -> list[RequestFuncInput]:
return [self[i] for i in range(len(self))]
class RandomDataset(BaseDataset):
def __init__(self, args, api_url: str, model: str):
self.args = args
self.api_url = api_url
self.model = model
self.num_prompts = args.num_prompts
def __len__(self) -> int:
return self.num_prompts
def __getitem__(self, idx: int) -> RequestFuncInput:
return RequestFuncInput(
prompt=f"Random prompt {idx} for benchmarking diffusion models",
api_url=self.api_url,
model=self.model,
width=self.args.width,
height=self.args.height,
num_frames=self.args.num_frames,
num_inference_steps=self.args.num_inference_steps,
seed=self.args.seed,
fps=self.args.fps,
)
def get_requests(self) -> list[RequestFuncInput]:
return [self[i] for i in range(len(self))]
def _compute_expected_latency_ms_from_base(req: RequestFuncInput, args, base_time_ms: float | None) -> float | None:
"""Compute expected execution time (ms) based on a base per-step-per-frame unit time.
Assumes linear scaling with pixel area, frame count, and num_inference_steps.
The base unit represents latency for a 16x16 resolution, single frame, single step.
"""
if base_time_ms is None:
return None
width = req.width if req.width is not None else args.width
height = req.height if req.height is not None else args.height
if width is None or height is None:
return None
frames = req.num_frames if req.num_frames is not None else args.num_frames
steps = req.num_inference_steps if req.num_inference_steps is not None else args.num_inference_steps
frame_scale = frames if isinstance(frames, int) and frames > 0 else 1
step_scale = steps if isinstance(steps, int) and steps > 0 else 1
area_units = max((float(width) * float(height)) / float(16 * 16), 1.0)
return float(base_time_ms) * area_units * frame_scale * step_scale
def _infer_slo_base_time_ms_from_warmups(
warmup_pairs: list[tuple[RequestFuncInput, RequestFuncOutput]],
args,
) -> float | None:
"""Infer base SLO unit time from warmup requests.
Returns the median base latency (ms) for a 16x16 resolution, single-frame,
single-step request. Only uses warmups that succeeded and have resolvable
width/height.
"""
candidates_ms: list[float] = []
for req, out in warmup_pairs:
if not out.success or out.latency <= 0:
continue
width = req.width if req.width is not None else args.width
height = req.height if req.height is not None else args.height
if width is None or height is None:
continue
frames = req.num_frames if req.num_frames is not None else args.num_frames
steps = req.num_inference_steps if req.num_inference_steps is not None else args.num_inference_steps
frame_scale = int(frames) if isinstance(frames, int) and frames > 0 else 1
step_scale = int(steps) if isinstance(steps, int) and steps > 0 else 1
area_units = max((float(width) * float(height)) / float(16 * 16), 1.0)
denom = area_units * float(frame_scale) * float(step_scale)
if denom <= 0:
continue
candidates_ms.append((out.latency * 1000.0) / denom)
if not candidates_ms:
return None
return float(np.median(candidates_ms))
def _populate_slo_ms_from_warmups(
requests_list: list[RequestFuncInput],
warmup_pairs: list[tuple[RequestFuncInput, RequestFuncOutput]],
args,
) -> list[RequestFuncInput]:
"""Populate missing RequestFuncInput.slo_ms using warmup outputs.
- If a request already has slo_ms (e.g., trace-provided), it is kept as-is.
- If any request has slo_ms is None and we can infer base time from warmups,
we estimate each missing request's expected execution time and set:
req.slo_ms = expected_latency_ms * args.slo_scale
Returns updated requests_list.
"""
if not any(req.slo_ms is None for req in requests_list):
return requests_list
base_time_ms = _infer_slo_base_time_ms_from_warmups(warmup_pairs, args)
if base_time_ms is None:
return requests_list
slo_scale = float(getattr(args, "slo_scale", 3.0))
if slo_scale <= 0:
raise ValueError(f"slo_scale must be positive, got {slo_scale}.")
updated: list[RequestFuncInput] = []
for req in requests_list:
if req.slo_ms is not None:
updated.append(req)
continue
expected_ms = _compute_expected_latency_ms_from_base(req, args, base_time_ms)
updated.append(replace(req, slo_ms=(expected_ms * slo_scale) if expected_ms is not None else None))
return updated
async def iter_requests(
requests_list: list[RequestFuncInput],
request_rate: float,
) -> AsyncGenerator[RequestFuncInput, None]:
"""Yield requests using a fixed interval if request_rate is set.
- If request_rate is inf, all requests are yielded immediately (no sleep).
- Otherwise, requests are emitted at a fixed cadence of 1 / request_rate seconds.
"""
if request_rate != float("inf"):
if request_rate <= 0:
raise ValueError(f"request_rate must be positive or inf, got {request_rate}.")
interval_s = 1.0 / float(request_rate)
for i, req in enumerate(requests_list):
if request_rate != float("inf") and i > 0:
await asyncio.sleep(interval_s)
yield req
def _guess_mime_type(path: str) -> str:
mime, _ = mimetypes.guess_type(path)
return mime or "application/octet-stream"
def _encode_image_as_data_url(path: str) -> str:
with open(path, "rb") as f:
encoded = base64.b64encode(f.read()).decode("utf-8")
mime = _guess_mime_type(path)
return f"data:{mime};base64,{encoded}"
async def async_request_chat_completions(
input: RequestFuncInput,
session: aiohttp.ClientSession,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
output = RequestFuncOutput()
output.start_time = time.perf_counter()
extra_body = dict(input.extra_body)
if input.width and input.height:
extra_body.setdefault("height", input.height)
extra_body.setdefault("width", input.width)
if input.num_frames:
extra_body.setdefault("num_frames", input.num_frames)
if input.num_inference_steps:
extra_body.setdefault("num_inference_steps", input.num_inference_steps)
if input.seed is not None:
extra_body.setdefault("seed", input.seed)
if input.fps:
extra_body.setdefault("fps", input.fps)
if input.image_paths and len(input.image_paths) > 0:
content = []
if input.prompt:
content.append({"type": "text", "text": input.prompt})
for img_path in input.image_paths:
if not os.path.exists(img_path):
output.error = f"Image file not found: {img_path}"
output.success = False
if pbar:
pbar.update(1)
return output
content.append(
{
"type": "image_url",
"image_url": {"url": _encode_image_as_data_url(img_path)},
}
)
messages = [{"role": "user", "content": content}]
else:
messages = [{"role": "user", "content": input.prompt}]
payload = {
"model": input.model,
"messages": messages,
}
if extra_body:
payload["extra_body"] = extra_body
try:
async with session.post(input.api_url, json=payload) as response:
if response.status == 200:
resp_json = await response.json()
output.response_body = resp_json
output.success = True
if "peak_memory_mb" in resp_json:
output.peak_memory_mb = resp_json["peak_memory_mb"]
else:
output.error = f"HTTP {response.status}: {await response.text()}"
output.success = False
except Exception as e:
output.error = str(e)
output.success = False
output.latency = time.perf_counter() - output.start_time
if output.success and input.slo_ms is not None:
output.slo_achieved = (output.latency * 1000.0) <= float(input.slo_ms)
if pbar:
pbar.update(1)
return output
def calculate_metrics(
outputs: list[RequestFuncOutput],
total_duration: float,
requests_list: list[RequestFuncInput],
args,
slo_enabled: bool,
):
success_outputs = [o for o in outputs if o.success]
error_outputs = [o for o in outputs if not o.success]
num_success = len(success_outputs)
latencies = [o.latency for o in success_outputs]
peak_memories = [o.peak_memory_mb for o in success_outputs if o.peak_memory_mb > 0]
metrics = {
"duration": total_duration,
"completed_requests": num_success,
"failed_requests": len(error_outputs),
"throughput_qps": num_success / total_duration if total_duration > 0 else 0,
"latency_mean": np.mean(latencies) if latencies else 0,
"latency_median": np.median(latencies) if latencies else 0,
"latency_p99": np.percentile(latencies, 99) if latencies else 0,
"latency_p50": np.percentile(latencies, 50) if latencies else 0,
"peak_memory_mb_max": max(peak_memories) if peak_memories else 0,
"peak_memory_mb_mean": np.mean(peak_memories) if peak_memories else 0,
"peak_memory_mb_median": np.median(peak_memories) if peak_memories else 0,
}
if slo_enabled:
slo_defined_total = 0
slo_met_success = 0
for req, out in zip(requests_list, outputs):
if req.slo_ms is None:
continue
slo_defined_total += 1
if out.slo_achieved is None:
continue
if out.slo_achieved:
slo_met_success += 1
slo_attain_all = (slo_met_success / slo_defined_total) if slo_defined_total > 0 else 0.0
metrics.update(
{
"slo_attainment_rate": slo_attain_all,
"slo_met_success": slo_met_success,
"slo_scale": getattr(args, "slo_scale", 3.0),
}
)
return metrics
def wait_for_service(base_url: str, timeout: int = 120) -> None:
print(f"Waiting for service at {base_url}...")
start_time = time.time()
while True:
try:
# Try /health endpoint first
resp = requests.get(f"{base_url}/health", timeout=1)
if resp.status_code == 200:
print("Service is ready.")
break
except requests.exceptions.RequestException:
pass
if time.time() - start_time > timeout:
raise TimeoutError(f"Service at {base_url} did not start within {timeout} seconds.")
time.sleep(1)
async def benchmark(args):
# Construct base_url if not provided
if args.base_url is None:
args.base_url = f"http://{args.host}:{args.port}"
# Setup dataset (vLLM-Omni supports diffusion via /v1/chat/completions)
api_url = f"{args.base_url}/v1/chat/completions"
request_func = async_request_chat_completions
if args.dataset == "vbench":
dataset = VBenchDataset(args, api_url, args.model)
elif args.dataset == "trace":
dataset = TraceDataset(args, api_url, args.model)
elif args.dataset == "random":
dataset = RandomDataset(args, api_url, args.model)
else:
raise ValueError(f"Unknown dataset: {args.dataset}")
print("Loading requests...")
requests_list = dataset.get_requests()
print(f"Prepared {len(requests_list)} requests from {args.dataset} dataset.")
# Limit concurrency
if args.max_concurrency is not None:
semaphore = asyncio.Semaphore(args.max_concurrency)
else:
semaphore = None
async def limited_request_func(req, session, pbar):
if semaphore:
async with semaphore:
return await request_func(req, session, pbar)
else:
return await request_func(req, session, pbar)
# Run benchmark
pbar = tqdm(total=len(requests_list), disable=args.disable_tqdm)
async with aiohttp.ClientSession() as session:
warmup_pairs: list[tuple[RequestFuncInput, RequestFuncOutput]] = []
if args.warmup_requests and requests_list:
print(
f"Running {args.warmup_requests} warmup request(s) \
with num_inference_steps={args.warmup_num_inference_steps}..."
)
for i in range(args.warmup_requests):
warm_req = requests_list[i % len(requests_list)]
if args.warmup_num_inference_steps is not None:
warm_req = replace(
warm_req,
num_inference_steps=args.warmup_num_inference_steps,
)
warm_out = await limited_request_func(warm_req, session, None)
warmup_pairs.append((warm_req, warm_out))
if args.slo:
# Prefer trace-provided per-request slo_ms. Only populate when missing.
requests_list = _populate_slo_ms_from_warmups(
requests_list=requests_list,
warmup_pairs=warmup_pairs,
args=args,
)
start_time = time.perf_counter()
tasks = []
async for req in iter_requests(requests_list=requests_list, request_rate=args.request_rate):
task = asyncio.create_task(limited_request_func(req, session, pbar))
tasks.append(task)
outputs = await asyncio.gather(*tasks)
total_duration = time.perf_counter() - start_time
pbar.close()
# Calculate metrics
metrics = calculate_metrics(outputs, total_duration, requests_list, args, args.slo)
print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=60, c="="))
# Section 1: Configuration
print("{:<40} {:<15}".format("Model:", args.model))
print("{:<40} {:<15}".format("Dataset:", args.dataset))
print("{:<40} {:<15}".format("Task:", args.task))
# Section 2: Execution & Traffic
print(f"{'-' * 50}")
print("{:<40} {:<15.2f}".format("Benchmark duration (s):", metrics["duration"]))
print("{:<40} {:<15}".format("Request rate:", str(args.request_rate)))
print(
"{:<40} {:<15}".format(
"Max request concurrency:",
str(args.max_concurrency) if args.max_concurrency else "not set",
)
)
print("{:<40} {}/{:<15}".format("Successful requests:", metrics["completed_requests"], len(requests_list)))
# Section 3: Performance Metrics
print(f"{'-' * 50}")
print("{:<40} {:<15.2f}".format("Request throughput (req/s):", metrics["throughput_qps"]))
print("{:<40} {:<15.4f}".format("Latency Mean (s):", metrics["latency_mean"]))
print("{:<40} {:<15.4f}".format("Latency Median (s):", metrics["latency_median"]))
print("{:<40} {:<15.4f}".format("Latency P99 (s):", metrics["latency_p99"]))
if args.slo:
print(f"{'-' * 50}")
print("{:<40} {:<15.2%}".format("SLO Attainment Rate (all):", metrics.get("slo_attainment_rate", 0.0)))
print("{:<40} {:<15}".format("SLO Met (success count):", str(metrics.get("slo_met_success", 0))))
print("{:<40} {:<15}".format("SLO Scale:", str(metrics.get("slo_scale", 3.0))))
if metrics["peak_memory_mb_max"] > 0:
print(f"{'-' * 50}")
print("{:<40} {:<15.2f}".format("Peak Memory Max (MB):", metrics["peak_memory_mb_max"]))
print("{:<40} {:<15.2f}".format("Peak Memory Mean (MB):", metrics["peak_memory_mb_mean"]))
print("{:<40} {:<15.2f}".format("Peak Memory Median (MB):", metrics["peak_memory_mb_median"]))
print("\n" + "=" * 60)
if args.output_file:
with open(args.output_file, "w") as f:
json.dump(metrics, f, indent=2)
print(f"Metrics saved to {args.output_file}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark serving for diffusion models.")
parser.add_argument(
"--base-url",
type=str,
default=None,
help="Base URL of the server (e.g., http://localhost:8091). Overrides host/port.",
)
parser.add_argument("--host", type=str, default="localhost", help="Server host.")
parser.add_argument("--port", type=int, default=8091, help="Server port.")
parser.add_argument("--model", type=str, default="default", help="Model name.")
parser.add_argument(
"--dataset",
type=str,
default="vbench",
choices=["vbench", "trace", "random"],
help="Dataset to use.",
)
parser.add_argument(
"--task",
type=str,
default="t2v",
choices=["t2v", "i2v", "ti2v", "ti2i", "i2i", "t2i"],
help="Task type.",
)
parser.add_argument(
"--dataset-path",
type=str,
default=None,
help="Path to local dataset file (optional).",
)
parser.add_argument("--num-prompts", type=int, default=10, help="Number of prompts to benchmark.")
parser.add_argument(
"--max-concurrency",
type=int,
default=1,
help="Maximum number of concurrent requests, default to `1`. This can be used "
"to help simulate an environment where a higher level component "
"is enforcing a maximum number of concurrent requests. While the "
"--request-rate argument controls the rate at which requests are "
"initiated, this argument will control how many are actually allowed "
"to execute at a time. This means that when used in combination, the "
"actual request rate may be lower than specified with --request-rate, "
"if the server is not processing requests fast enough to keep up.",
)
parser.add_argument(
"--request-rate",
type=float,
default=float("inf"),
help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
"Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
)
parser.add_argument(
"--warmup-requests",
type=int,
default=1,
help="Number of warmup requests to run before measurement.",
)
parser.add_argument(
"--warmup-num-inference-steps",
type=int,
default=1,
help="num_inference_steps used for warmup requests.",
)
parser.add_argument("--width", type=int, default=None, help="Image/Video width.")
parser.add_argument("--height", type=int, default=None, help="Image/Video height.")
parser.add_argument("--num-frames", type=int, default=None, help="Number of frames (for video).")
parser.add_argument(
"--num-inference-steps",
type=int,
default=50,
help="Number of inference steps (for diffusion models).",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Random seed (for diffusion models).",
)
parser.add_argument("--fps", type=int, default=None, help="FPS (for video).")
parser.add_argument("--output-file", type=str, default=None, help="Output JSON file for metrics.")
parser.add_argument(
"--slo",
action="store_true",
help=(
"Enable SLO calculation and reporting. If trace provides per-request slo_ms, it is used. "
"Otherwise, warmup request(s) are used to infer expected execution time assuming linear "
"scaling by resolution, frames, and steps, then slo_ms = expected_time * --slo-scale."
),
)
parser.add_argument(
"--slo-scale",
type=float,
default=3.0,
help="SLO target multiplier: slo_ms = estimated_exec_time_ms * slo_scale (default: 3).",
)
parser.add_argument("--disable-tqdm", action="store_true", help="Disable progress bar.")
args = parser.parse_args()
asyncio.run(benchmark(args))
# Benchmarks Guide
This README explains how to (1) prepare benchmark datasets and (2) run the provided Qwen3-Omni benchmarks.
## 1) Prepare the dataset (SeedTTS top100)
```bash
cd benchmarks/build_dataset
pip install gdown
# Download SeedTTS test set from Google Drive
gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP
# Extract
tar -xf seedtts_testset.tar
# Copy metadata and extract top-100 prompts
cp seedtts_testset/en/meta.lst meta.lst
python extract_tts_prompts.py -i meta.lst -o top100.txt -n 100
# (Optional) clean up to save space
rm -rf seedtts_testset seedtts_testset.tar meta.lst
```
Artifacts:
- `benchmarks/build_dataset/top100.txt` — 100 text prompts (one per line).
## 2) Run benchmarks
All commands assume repo root (`vllm-omni`).
### A. Transformers benchmark (offline, HF Transformers)
```bash
bash benchmarks/qwen3-omni/transformers/eval_qwen3_moe_omni_transformers.sh
```
What it does:
- Runs `qwen3_omni_moe_transformers.py` over `top100.txt` with `--num_prompts 100`.
- Outputs to `benchmarks/qwen3-omni/transformers/benchmark_results/`:
- `perf_stats.json` — aggregated & per-prompt TPS/latency (thinker/talker/code2wav/overall).
- `results.json` — per-prompt outputs and audio paths.
- `audio/` — ~100 generated `.wav` files.
Key checks:
- `overall_tps` and `*_tps_avg` should be non-zero and reasonably stable.
- Investigate any 0/NaN or unusually low TPS / long-tail latency.
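A minimal sketch for eyeballing the aggregated numbers (the path is as described above; the exact key layout depends on the benchmark's `perf_stats.json`):
```python
import json

# Minimal sketch: load the aggregated stats written by the transformers benchmark
# and print them for a quick sanity check (see the key checks above).
with open("benchmarks/qwen3-omni/transformers/benchmark_results/perf_stats.json") as f:
    print(json.dumps(json.load(f), indent=2))
```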
### B. vLLM Omni end-to-end benchmark (pipeline)
```bash
bash benchmarks/qwen3-omni/vllm_omni/eval_qwen3_moe_omni.sh
```
What it does:
- Runs `examples/offline_inference/qwen3_omni/end2end.py` with `--enable-stats`.
- Uses `benchmarks/build_dataset/top100.txt` and writes to:
- Logs: `benchmarks/qwen3-omni/vllm_omni/logs/`
- `omni_llm_pipeline_text.orchestrator.stats.jsonl` — per-stage latency stats.
- `omni_llm_pipeline_text.overall.stats.jsonl` — end-to-end latency/TPS.
- `omni_llm_pipeline_text.stage{0,1,2}.log` — per-stage detailed logs/errors.
- Outputs: `benchmarks/qwen3-omni/vllm_omni/outputs/` — ~100 text and `.wav` files.
Key checks:
- Overall stats: end-to-end latency/TPS should be reasonable.
- Orchestrator stats: per-stage latency should be stable; investigate long tails.
- Stage logs: ensure no errors and no unusually slow stages.
## Performance snapshot
The chart below summarizes our measured Qwen3-Omni MoE end-to-end benchmark, comparing vLLM-Omni against HF Transformers, and shows the overall throughput advantage of vLLM-Omni. These are measured results; use them as the reference point when evaluating or reproducing the benchmark.
![vLLM-Omni vs HF](./vllm-omni-vs-hf.png)
## Directory layout
- `benchmarks/build_dataset/` — dataset prep utilities (e.g., SeedTTS top100).
- `benchmarks/<model>/vllm_omni/` — vLLM-Omni pipeline benchmarks, logs, outputs.
- `benchmarks/<model>/transformers/` — HF Transformers benchmarks (offline reference).
- `benchmarks/<model>/vllm-omni-vs-hf.png` — current performance snapshot (overall throughput comparison).
- Add new tasks under `benchmarks/<model>/...` with the same pattern: `transformers/`, `vllm_omni/`, a task-specific README, and (optionally) dataset prep notes.
## Troubleshooting
- Make sure GPU/driver/FlashAttention2 requirements are met for the chosen model.
- If downloads fail, confirm network access to Google Drive (`gdown`) and Hugging Face.
- If audio files are missing, check for errors in stage logs or model generation.
#!/bin/bash
# Qwen3-Omni Transformers Benchmark Evaluation Script
# This script must be run from the vllm-omni root directory
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Navigate to vllm-omni root directory (4 levels up from script location)
VLLM_OMNI_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
cd "$VLLM_OMNI_ROOT" || { echo "Error: Failed to navigate to vllm-omni directory"; exit 1; }
echo "Working directory: $(pwd)"
# Verify we're in the correct directory and run benchmark
if [[ ! -f "benchmarks/qwen3-omni/transformers/qwen3_omni_moe_transformers.py" ]]; then
echo "Error: Not in vllm-omni root directory. Please run from vllm-omni folder."
else
cd benchmarks/qwen3-omni/transformers
python qwen3_omni_moe_transformers.py --prompts_file ../../build_dataset/top100.txt --num_prompts 100
echo "Logs and outputs are saved to $(pwd)/benchmark_results:"
echo " - perf_stats.json Aggregated/per-prompt TPS and latency (thinker/talker/code2wav/overall)"
echo " - results.json Per-prompt outputs and audio paths"
echo " - audio/ Generated wav files, there should be 100 wav file generated"
echo "Key checks: overall_tps and *_tps_avg should be non-zero and stable; investigate 0/NaN or unusually low TPS/long-tail latency."
fi
import time
import torch
from transformers import Qwen3OmniMoeForConditionalGeneration
class Qwen3OmniMoeForConditionalGenerationWithLogging(Qwen3OmniMoeForConditionalGeneration):
@torch.no_grad()
def generate(
self,
input_ids: torch.Tensor | None = None,
speaker: str = "Ethan",
use_audio_in_video: bool = False,
return_audio: bool | None = None,
thinker_max_new_tokens: int = 1024,
thinker_eos_token_id: int = 151645,
talker_max_new_tokens: int = 4096,
talker_do_sample: bool = True,
talker_top_k: int = 50,
talker_top_p: float = 1.0,
talker_temperature: float = 0.9,
talker_repetition_penalty: float = 1.05,
**kwargs,
):
total_t0 = time.time()
perf_stats = {
"thinker_tokens": 0,
"thinker_time_s": 0.0,
"thinker_tps": 0.0,
"talker_tokens": 0,
"talker_time_s": 0.0,
"talker_tps": 0.0,
"code2wav_tokens": 0,
"code2wav_time_s": 0.0,
"code2wav_tps": 0.0,
"total_tokens": 0,
"total_time_s": 0.0,
"total_tps": 0.0,
}
if return_audio and not self.has_talker:
raise ValueError(
"Cannot use talker when talker module not initialized. "
"Use `enable_talker` method or set enable_talker in config "
"to enable talker."
)
if return_audio is None:
return_audio = self.has_talker
shared_kwargs = {"use_audio_in_video": use_audio_in_video}
thinker_kwargs = {
"max_new_tokens": thinker_max_new_tokens,
"eos_token_id": thinker_eos_token_id,
}
talker_kwargs = {}
token2wav_kwargs = {}
if return_audio:
speaker_id = self.config.talker_config.speaker_id.get(speaker.lower())
if speaker_id is None:
raise NotImplementedError(f"Speaker {speaker} not implemented")
if input_ids.shape[0] != 1:
raise NotImplementedError("Qwen3-Omni currently does not support batched inference with audio output")
talker_suppressed_tokens = [
i
for i in range(
self.config.talker_config.text_config.vocab_size - 1024,
self.config.talker_config.text_config.vocab_size,
)
if i != self.config.talker_config.codec_eos_token_id
] # Suppress additional special tokens, should not be predicted
talker_kwargs = {
"max_new_tokens": talker_max_new_tokens,
"do_sample": talker_do_sample,
"top_k": talker_top_k,
"top_p": talker_top_p,
"temperature": talker_temperature,
"eos_token_id": self.config.talker_config.codec_eos_token_id,
"repetition_penalty": talker_repetition_penalty,
"suppress_tokens": talker_suppressed_tokens,
"output_hidden_states": True,
"return_dict_in_generate": True,
}
token2wav_kwargs = {}
for key, value in kwargs.items():
if key.startswith("thinker_"):
thinker_kwargs[key[len("thinker_") :]] = value
elif key.startswith("talker_"):
talker_kwargs[key[len("talker_") :]] = value
elif key.startswith("token2wav_"):
token2wav_kwargs[key[len("token2wav_") :]] = value
# Process special input values
elif key == "feature_attention_mask":
thinker_kwargs[key] = value
talker_kwargs["audio_feature_lengths"] = torch.sum(value, dim=1)
elif key in ("input_features", "attention_mask"):
thinker_kwargs[key] = value
# Put other key to shared kwargs
else:
shared_kwargs[key] = value
# Merge kwargs
for key, value in shared_kwargs.items():
if key not in thinker_kwargs:
thinker_kwargs[key] = value
if key not in talker_kwargs and key in ["image_grid_thw", "video_grid_thw", "video_second_per_grid"]:
talker_kwargs[key] = value
if key not in token2wav_kwargs:
token2wav_kwargs[key] = value
# 1. Generate from thinker module
generate_audio = return_audio and self.has_talker
if generate_audio:
thinker_kwargs["output_hidden_states"] = True
thinker_kwargs["return_dict_in_generate"] = True
t0 = time.time()
thinker_result = self.thinker.generate(input_ids=input_ids, **thinker_kwargs)
t1 = time.time()
perf_stats["thinker_time_s"] = max(0.0, t1 - t0)
try:
prompt_len = int(input_ids.shape[1]) if input_ids is not None else 0
total_len = int(thinker_result.sequences.shape[-1])
thinker_out_len = max(0, total_len - prompt_len)
except Exception:
thinker_out_len = 0
perf_stats["thinker_tokens"] = thinker_out_len
perf_stats["thinker_tps"] = (
(thinker_out_len / perf_stats["thinker_time_s"]) if perf_stats["thinker_time_s"] > 0 else 0.0
)
if not generate_audio:
perf_stats["total_tokens"] = perf_stats["thinker_tokens"]
perf_stats["total_time_s"] = time.time() - total_t0
perf_stats["total_tps"] = (
(perf_stats["total_tokens"] / perf_stats["total_time_s"]) if perf_stats["total_time_s"] > 0 else 0.0
)
# attach stats to self
setattr(self, "_perf_stats_last", perf_stats)
if not hasattr(self, "_perf_stats_history"):
setattr(self, "_perf_stats_history", [])
self._perf_stats_history.append(perf_stats)
return thinker_result, None
# 2. Prepare talker input
thinker_embed = torch.cat([hidden_states[0] for hidden_states in thinker_result.hidden_states], dim=1).to(
self.talker.device
) # [1 t d]
thinker_hidden = torch.cat(
[
hidden_states[self.config.talker_config.accept_hidden_layer]
for hidden_states in thinker_result.hidden_states
],
dim=1,
).to(self.talker.device) # [1 t d]
im_start_indexes = torch.cat(
(
torch.nonzero(input_ids[0] == self.config.im_start_token_id).squeeze(),
torch.tensor([thinker_result.sequences.shape[-1]], device=input_ids.device, dtype=input_ids.dtype),
),
dim=-1,
).to(self.talker.device) # Shape [n_starts + 1]; Take batch 0 since batched inference is not supported here.
multimodal_mask = (
(thinker_result.sequences == self.config.thinker_config.audio_token_id) |
(thinker_result.sequences == self.config.thinker_config.image_token_id) |
(thinker_result.sequences == self.config.thinker_config.video_token_id)
).to(self.talker.device) # [1 t] # fmt: skip
talker_special_tokens = torch.tensor(
[[self.config.tts_bos_token_id, self.config.tts_eos_token_id, self.config.tts_pad_token_id]],
device=self.thinker.device,
dtype=input_ids.dtype,
)
tts_bos_embed, tts_eos_embed, tts_pad_embed = (
self.talker.text_projection(self.thinker.get_input_embeddings()(talker_special_tokens))
.to(self.talker.device)
.chunk(3, dim=1)
) # 3 * [1 1 d]
talker_input_embeds = [] # [1 t d]
talker_input_ids = []
# For every chatml parts
for i in range(len(im_start_indexes) - 1):
im_start_index = im_start_indexes[i]
segment_end_index = im_start_indexes[i + 1]
role_token = input_ids[0][im_start_index + 1]
# Talker should ignore thinker system prompt
if role_token == self.config.system_token_id:
continue
# Talker takes word embeddings for tokens and hidden state from `accept_hidden_layer` for multimodal inputs
elif role_token == self.config.user_token_id:
talker_user_part = self._get_talker_user_parts(
im_start_index, segment_end_index, multimodal_mask, thinker_hidden, thinker_embed
)
talker_input_embeds.append(talker_user_part)
talker_input_ids.append(thinker_result.sequences[:, im_start_index:segment_end_index])
# Take assistant output (for now)
elif role_token == self.config.assistant_token_id and i == len(im_start_indexes) - 2:
talker_assistant_embeds, talker_assistant_ids, trailing_text_hidden = self._get_talker_assistant_parts(
im_start_index,
segment_end_index,
speaker_id,
thinker_embed,
tts_pad_embed,
tts_bos_embed,
tts_eos_embed,
)
talker_input_embeds.append(talker_assistant_embeds)
talker_input_ids.append(talker_assistant_ids)
# History assistant output (ignore for now)
elif role_token == self.config.assistant_token_id and i != len(im_start_indexes) - 2:
continue
else:
raise AssertionError("Expect role id after <|im_start|> (assistant, user, system)")
talker_input_embed = torch.cat([embed.to(self.talker.device) for embed in talker_input_embeds], dim=1)
talker_input_id = torch.cat([embed.to(self.talker.device) for embed in talker_input_ids], dim=1)
t2 = time.time()
talker_result = self.talker.generate(
inputs_embeds=talker_input_embed,
trailing_text_hidden=trailing_text_hidden,
tts_pad_embed=tts_pad_embed,
talker_input_ids=talker_input_id,  # Do not pass input_ids directly, to prevent repetition-penalty out-of-bound indexing
**talker_kwargs,
)
t3 = time.time()
perf_stats["talker_time_s"] = max(0.0, t3 - t2)
talker_codes = (
torch.stack([hid[-1] for hid in talker_result.hidden_states if hid[-1] is not None], dim=1)
.transpose(1, 2)
.to(self.code2wav.device)
)
try:
# codes shape: (B, num_quantizers, T). We log T as token length.
perf_stats["talker_tokens"] = int(talker_codes.shape[-1])
except Exception:
perf_stats["talker_tokens"] = 0
perf_stats["talker_tps"] = (
(perf_stats["talker_tokens"] / perf_stats["talker_time_s"]) if perf_stats["talker_time_s"] > 0 else 0.0
)
t4 = time.time()
talker_wavs = self.code2wav.chunked_decode(talker_codes, chunk_size=300, left_context_size=25).float()
t5 = time.time()
perf_stats["code2wav_time_s"] = max(0.0, t5 - t4)
perf_stats["code2wav_tokens"] = perf_stats["talker_tokens"] # same T, not times 16
perf_stats["code2wav_tps"] = (
(perf_stats["code2wav_tokens"] / perf_stats["code2wav_time_s"])
if perf_stats["code2wav_time_s"] > 0
else 0.0
)
perf_stats["total_tokens"] = perf_stats["thinker_tokens"] + perf_stats["talker_tokens"]
perf_stats["total_time_s"] = time.time() - total_t0
perf_stats["total_tps"] = (
(perf_stats["total_tokens"] / perf_stats["total_time_s"]) if perf_stats["total_time_s"] > 0 else 0.0
)
setattr(self, "_perf_stats_last", perf_stats)
if not hasattr(self, "_perf_stats_history"):
setattr(self, "_perf_stats_history", [])
self._perf_stats_history.append(perf_stats)
return thinker_result, talker_wavs.float()
__all__ = [
"Qwen3OmniMoeForConditionalGenerationWithLogging",
]