"msvc/gtest_main-md.vcproj" did not exist on "ee39a89debba2b2e00dec3fa2df03e1d3dcb4027"
Commit c1cacde6 authored by weishb's avatar weishb
Browse files

vllm-omni_0.15.0.rc1+fix1 first commit

parent 35607782
#!/bin/bash
# vllm-omni customized version
# Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/bootstrap-amd.sh
# Last synced: 2025-12-15
# Modifications: Use local template file instead of downloading from ci-infra
set -euo pipefail

# Default every control knob that the environment left unset or empty.
# `${VAR:=default}` assigns only in that case, so explicit env values win.
: "${RUN_ALL:=0}"
: "${NIGHTLY:=0}"
: "${VLLM_CI_BRANCH:=main}"
: "${AMD_MIRROR_HW:=amdproduction}"
: "${DOCS_ONLY_DISABLE:=0}"
# Decide whether the generated pipeline should fail fast.
# Prints "true" unless this is not a PR build, or the PR carries the
# "ci-no-fail-fast" label (both print "false").
fail_fast() {
  local disable_label="ci-no-fail-fast"
  if [ "$BUILDKITE_PULL_REQUEST" = "false" ]; then
    # not a PR or BUILDKITE_PULL_REQUEST not set
    echo false
    return
  fi
  # PR build: fetch the label names from the GitHub API.
  local labels
  labels=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
  if [[ $labels == *"$disable_label"* ]]; then
    echo false
  else
    echo true
  fi
}
# Print "true" when the PR carries the "ready-run-all-tests" label,
# otherwise "false" (non-PR builds always print "false").
check_run_all_label() {
  local want_label="ready-run-all-tests"
  if [ "$BUILDKITE_PULL_REQUEST" = "false" ]; then
    # not a PR or BUILDKITE_PULL_REQUEST not set
    echo false
    return
  fi
  # PR build: fetch the label names from the GitHub API.
  local labels
  labels=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
  if [[ $labels == *"$want_label"* ]]; then
    echo true
  else
    echo false
  fi
}
# Coverage reporting toggle; off unless the environment enables it.
: "${COV_ENABLED:=0}"
# Render the pipeline from the local Jinja template and hand it to
# buildkite-agent, then exit 0. Reads globals: RUN_ALL, NIGHTLY,
# AMD_MIRROR_HW, VLLM_CI_BRANCH, COV_ENABLED, LIST_FILE_DIFF,
# VLLM_USE_PRECOMPILED, and BUILDKITE_* variables; calls fail_fast.
upload_pipeline() {
echo "Uploading pipeline..."
# Install minijinja
ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI'
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh
source /var/lib/buildkite-agent/.cargo/env
# fastcheck pipelines target tentative AMD hardware instead of production
if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then
AMD_MIRROR_HW="amdtentative"
fi
# Use local template file for vllm-omni
cp .buildkite/test-template-amd-omni.j2 .buildkite/test-template.j2
# (WIP) Use pipeline generator instead of jinja template
# If the generator script exists it takes over completely and we exit here.
if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then
python -m pip install click pydantic
python .buildkite/pipeline_generator/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" --nightly="$NIGHTLY" --mirror_hw="$AMD_MIRROR_HW"
buildkite-agent pipeline upload .buildkite/pipeline.yaml
exit 0
fi
echo "List file diff: $LIST_FILE_DIFF"
echo "Run all: $RUN_ALL"
echo "Nightly: $NIGHTLY"
echo "AMD Mirror HW: $AMD_MIRROR_HW"
FAIL_FAST=$(fail_fast)
cd .buildkite
# Subshell so `set -x` tracing stays scoped to the render step.
(
set -x
# Output pipeline.yaml with all blank lines removed
minijinja-cli test-template.j2 test-amd.yaml \
-D branch="$BUILDKITE_BRANCH" \
-D list_file_diff="$LIST_FILE_DIFF" \
-D run_all="$RUN_ALL" \
-D nightly="$NIGHTLY" \
-D mirror_hw="$AMD_MIRROR_HW" \
-D fail_fast="$FAIL_FAST" \
-D vllm_use_precompiled="$VLLM_USE_PRECOMPILED" \
-D vllm_merge_base_commit="$(git merge-base origin/main HEAD)" \
-D cov_enabled="$COV_ENABLED" \
-D vllm_ci_branch="$VLLM_CI_BRANCH" \
| sed '/^[[:space:]]*$/d' \
> pipeline.yaml
)
cat pipeline.yaml
buildkite-agent artifact upload pipeline.yaml
buildkite-agent pipeline upload pipeline.yaml
exit 0
}
# List files changed relative to the merge-base with origin/main,
# printed as one space-separated line (callers re-split on spaces or
# translate the separator, so the word-splitting `echo $(...)` is
# deliberate). Side effect: stages the working tree so new/untracked
# files are included in the diff.
get_diff() {
    # Bug fix: this was `$(git add .)`, which ran git-add in a command
    # substitution and then tried to execute its stdout as a command.
    git add .
    echo $(git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD))
}
# List files changed by the latest commit on main (diff vs HEAD~1),
# printed as one space-separated line, mirroring get_diff().
# Side effect: stages the working tree first.
get_diff_main() {
    # Bug fix: this was `$(git add .)`, which ran git-add in a command
    # substitution and then tried to execute its stdout as a command.
    git add .
    echo $(git diff --name-only --diff-filter=ACMDR HEAD~1)
}
# Changed-file list for this build: PR branches diff against the
# merge-base with origin/main; pushes to main diff against HEAD~1.
file_diff=$(get_diff)
if [[ $BUILDKITE_BRANCH == "main" ]]; then
file_diff=$(get_diff_main)
fi
# ----------------------------------------------------------------------
# Early exit start: skip pipeline if conditions are met
# ----------------------------------------------------------------------
# skip pipeline if all changed files are under docs/
if [[ "${DOCS_ONLY_DISABLE}" != "1" ]]; then
if [[ -n "${file_diff:-}" ]]; then
docs_only=1
# Robust iteration over the file list. file_diff is a single
# space-separated line (see get_diff), so split it onto newlines first.
while IFS= read -r f; do
[[ -z "$f" ]] && continue
# **Policy:** only skip if *every* path starts with docs/
if [[ "$f" != docs/* ]]; then
docs_only=0
break
fi
done < <(printf '%s\n' "$file_diff" | tr ' ' '\n' | tr -d '\r')
if [[ "$docs_only" -eq 1 ]]; then
# Bug fix: the markdown code-fence backticks were unescaped inside a
# double-quoted string, so bash treated them as command substitutions
# and tried to EXECUTE the file list as a command (and the rendered
# annotation lost its fences). Escaping keeps them literal.
buildkite-agent annotate ":memo: CI skipped — docs/** only changes detected
\`\`\`
${file_diff}
\`\`\`" --style "info" || true
echo "[docs-only] All changes are under docs/. Exiting before pipeline upload."
exit 0
fi
fi
fi
# Paths whose modification forces a full test run (build-critical files;
# prefix-matched below).
patterns=(
"docker/Dockerfile"
"CMakeLists.txt"
"requirements/common.txt"
"requirements/cuda.txt"
"requirements/build.txt"
"requirements/test.txt"
"setup.py"
"csrc/"
"cmake/"
)
# Exceptions inside the prefixes above that should NOT force a full run
# (other-platform variants of the build files).
ignore_patterns=(
"docker/Dockerfile."
"csrc/cpu"
"csrc/rocm"
"cmake/hipify.py"
"cmake/cpu_extension.cmake"
)
# Set RUN_ALL=1 when any changed file hits a critical pattern and is not
# covered by an ignore pattern. file_diff is intentionally unquoted so
# the space-separated list word-splits into one path per iteration.
for file in $file_diff; do
# First check if file matches any pattern
matches_pattern=0
for pattern in "${patterns[@]}"; do
# Prefix match or exact match against the critical path list.
if [[ $file == $pattern* ]] || [[ $file == $pattern ]]; then
matches_pattern=1
break
fi
done
# If file matches pattern, check it's not in ignore patterns
if [[ $matches_pattern -eq 1 ]]; then
matches_ignore=0
for ignore in "${ignore_patterns[@]}"; do
if [[ $file == $ignore* ]] || [[ $file == $ignore ]]; then
matches_ignore=1
break
fi
done
if [[ $matches_ignore -eq 0 ]]; then
# One qualifying file is enough; stop scanning.
RUN_ALL=1
echo "Found changes: $file. Run all tests"
break
fi
fi
done
# Check for ready-run-all-tests label
LABEL_RUN_ALL=$(check_run_all_label)
if [[ $LABEL_RUN_ALL == true ]]; then
RUN_ALL=1
NIGHTLY=1
echo "Found 'ready-run-all-tests' label. Running all tests including optional tests."
fi
# Decide whether to use precompiled wheels
# Relies on existing patterns array as a basis.
# Precedence: explicit env setting > critical changes (build from
# source) > default (precompiled wheels).
if [[ -n "${VLLM_USE_PRECOMPILED:-}" ]]; then
echo "VLLM_USE_PRECOMPILED is already set to: $VLLM_USE_PRECOMPILED"
elif [[ $RUN_ALL -eq 1 ]]; then
export VLLM_USE_PRECOMPILED=0
echo "Detected critical changes, building wheels from source"
else
export VLLM_USE_PRECOMPILED=1
echo "No critical changes, using precompiled wheels"
fi
# '|'-joined diff list for template consumption (spaces -> pipes).
LIST_FILE_DIFF=$(get_diff | tr ' ' '|')
if [[ $BUILDKITE_BRANCH == "main" ]]; then
LIST_FILE_DIFF=$(get_diff_main | tr ' ' '|')
fi
upload_pipeline
# Buildkite pipeline: build the CI image once; every test step below
# depends on it via `depends_on: image-build`.
steps:
- label: ":docker: Build image"
key: image-build
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker build --file docker/Dockerfile.ci -t vllm-omni-ci ."
- "docker tag vllm-omni-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT"
agents:
queue: "cpu_queue_premerge"
# - label: "Test on NPU"
# depends_on: ~
# key: npu-test
# commands:
# - ".buildkite/scripts/hardware_ci/run_npu_test.sh"
# agents:
# queue: "ascend"
# Unit tests that need a single GPU; runs inside the image built above.
- label: "Simple Unit Test"
depends_on: image-build
commands:
- pytest -v -s tests/entrypoints/
- pytest -v -s tests/diffusion/cache/
- pytest -v -s tests/diffusion/lora/
- pytest -v -s tests/model_executor/models/qwen2_5_omni/test_audio_length.py
- pytest -v -s tests/worker/
- pytest -v -s tests/distributed/omni_connectors/test_kv_flow.py
agents:
queue: "gpu_1_queue"
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
# Single-GPU e2e test steps; all share the same image/plugin/HF-cache
# setup and differ only in label, timeout, and pytest targets.
- label: "Diffusion Model Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Images API LoRA E2E"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Model CPU offloading Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
- pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Audio Generation Model Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Cache Backend Test"
timeout_in_minutes: 15
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
# Multi-GPU (4x L4) parallelism tests; shm-size is raised for NCCL/IPC.
- label: "Diffusion Sequence Parallelism Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
shm-size: "8gb"
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion Tensor Parallelism Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
shm-size: "8gb"
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- label: "Diffusion GPU Worker Test"
timeout_in_minutes: 20
depends_on: image-build
commands:
- pytest -s -v tests/diffusion/test_diffusion_worker.py
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
shm-size: "8gb"
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
# Benchmark test on the H100 Kubernetes pool (kubernetes plugin rather
# than the docker plugin used by the AWS queues).
- label: "Benchmark Test"
timeout_in_minutes: 15
depends_on: image-build
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/benchmarks/test_serve_cli.py
agents:
queue: "mithril-h100-pool"
plugins:
- kubernetes:
podSpec:
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
resources:
limits:
nvidia.com/gpu: 2
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: HF_HOME
value: /root/.cache/huggingface
nodeSelector:
node.kubernetes.io/instance-type: gpu-h100-sxm
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate
# Qwen2.5-Omni end-to-end test on the 4x L4 queue.
- label: "Omni Model Test"
timeout_in_minutes: 15
depends_on: image-build
commands:
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
# - label: "Omni Model Test with H100"
# timeout_in_minutes: 30
# depends_on: image-build
# commands:
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
# - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
# - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
# - pytest -s -v tests/e2e/online_serving/test_async_omni.py
# agents:
# queue: "mithril-h100-pool"
# plugins:
# - kubernetes:
# podSpec:
# containers:
# - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# resources:
# limits:
# nvidia.com/gpu: 2
# volumeMounts:
# - name: devshm
# mountPath: /dev/shm
# - name: hf-cache
# mountPath: /root/.cache/huggingface
# env:
# - name: HF_HOME
# value: /root/.cache/huggingface
# nodeSelector:
# node.kubernetes.io/instance-type: gpu-h100-sxm
# volumes:
# - name: devshm
# emptyDir:
# medium: Memory
# - name: hf-cache
# hostPath:
# path: /mnt/hf-cache
# type: DirectoryOrCreate
# Image-edit serving test on a single H100 via the Kubernetes plugin.
- label: "Diffusion Image Edit Test with H100 (1 GPU)"
timeout_in_minutes: 20
depends_on: image-build
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
agents:
queue: "mithril-h100-pool"
plugins:
- kubernetes:
podSpec:
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: HF_HOME
value: /root/.cache/huggingface
nodeSelector:
node.kubernetes.io/instance-type: gpu-h100-sxm
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate
# - label: "Bagel Text2Img Model Test with H100"
# timeout_in_minutes: 30
# depends_on: image-build
# commands:
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
# agents:
# queue: "mithril-h100-pool"
# plugins:
# - kubernetes:
# podSpec:
# containers:
# - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# resources:
# limits:
# nvidia.com/gpu: 1
# volumeMounts:
# - name: devshm
# mountPath: /dev/shm
# - name: hf-cache
# mountPath: /root/.cache/huggingface
# env:
# - name: HF_HOME
# value: /root/.cache/huggingface
# nodeSelector:
# node.kubernetes.io/instance-type: gpu-h100-sxm
# volumes:
# - name: devshm
# emptyDir:
# medium: Memory
# - name: hf-cache
# hostPath:
# path: /mnt/hf-cache
# type: DirectoryOrCreate
#!/bin/bash
# Helper function to safely login to ECR Public with per-job config isolation
# Uses DOCKER_CONFIG environment variable to prevent race conditions
#
# This script prevents the "device or resource busy" error by giving each
# Buildkite job its own isolated Docker config directory.
#
# Usage:
# source docker_login_ecr_public.sh && safe_docker_login_ecr_public
# NOTE(review): strict mode below also applies to any script that
# sources this helper — confirm callers expect -e/-u/pipefail.
set -euo pipefail
# Configuration
ECR_REGISTRY="public.ecr.aws"
# Point DOCKER_CONFIG at a per-job directory so concurrent Buildkite
# jobs on one host never share (or race on) Docker credentials.
setup_isolated_docker_config() {
  # Prefer the Buildkite job id; fall back to our PID outside Buildkite.
  local suffix
  suffix="${BUILDKITE_JOB_ID:-$$}"
  export DOCKER_CONFIG="/tmp/docker-config-${suffix}"
  mkdir -p -- "$DOCKER_CONFIG"
  echo "[docker-config] Using isolated Docker config: $DOCKER_CONFIG"
}
# Return 0 when $1 already appears in this job's Docker config file
# (i.e. we logged in to that registry earlier in this job), else 1.
check_docker_auth() {
  local registry="$1"
  local cfg="${DOCKER_CONFIG}/config.json"
  if [[ -f "$cfg" ]] && grep -q "$registry" "$cfg" 2>/dev/null; then
    return 0
  fi
  return 1
}
# Log in to ECR Public using the per-job isolated config, skipping the
# network round-trip when this job already has cached credentials.
# Returns docker login's exit code on failure.
safe_docker_login_ecr_public() {
# Setup isolated config first
setup_isolated_docker_config
local registry="$ECR_REGISTRY"
# Check if already authenticated (within this job)
if check_docker_auth "$registry"; then
echo "[docker-login] Already authenticated to $registry in this job"
return 0
fi
# Perform login to isolated config directory
echo "[docker-login] Logging in to $ECR_REGISTRY (isolated config)..."
if aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$ECR_REGISTRY"; then
echo "[docker-login] Login successful (config: $DOCKER_CONFIG)"
return 0
else
# $? still holds the failed login pipeline's status at this point.
local exit_code=$?
echo "[docker-login] ERROR: Login failed with exit code $exit_code" >&2
return $exit_code
fi
}
# Execute if run as script (not sourced); when sourced, callers invoke
# safe_docker_login_ecr_public themselves.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
safe_docker_login_ecr_public
fi
#!/bin/bash
# vllm-omni customized version
# Based on: vllm/.buildkite/scripts/hardware_ci/run-amd-test.sh
# Last synced: 2025-12-15
# Modifications: docker image name for vllm-omni
# This script runs test inside the corresponding ROCm docker container.
set -o pipefail
# Export Python path
export PYTHONPATH=".."
# Print ROCm version
echo "--- Confirming Clean Initial State"
# Poll until the host agent's GPU-state file reports "clean".
# NOTE(review): no timeout — this loops forever if the state never clears.
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- ROCm info"
rocminfo
# cleanup older docker images
# Prune images/volumes when the filesystem holding Docker's root
# directory is more than 70% full, so long-lived CI hosts keep disk free.
cleanup_docker() {
  local docker_root disk_usage
  local -r threshold=70
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Percent-used of the filesystem backing the docker root (strip '%').
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images, then unused volumes and anything older than 72h.
    docker image prune -f
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
# Call the cleanup docker function
cleanup_docker
echo "--- Resetting GPUs"
# Ask the host agent to reset the GPUs, then wait for the "clean" marker.
echo "reset" > /opt/amdgpu/etc/gpu_state
while true; do
sleep 3
if grep -q clean /opt/amdgpu/etc/gpu_state; then
echo "GPUs state is \"clean\""
break
fi
done
echo "--- Pulling container"
image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-rocm-omni"
# Random suffix keeps container names unique across concurrent runs.
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Install AWS CLI to authenticate to ECR Public Gallery to get higher rate limit for pulling images
sudo apt-get update && sudo apt-get install -y awscli
# Use safe docker login helper to prevent race conditions
source "$(dirname "${BASH_SOURCE[0]}")/../docker_login_ecr_public.sh"
safe_docker_login_ecr_public
# Pull the container from ECR Public Gallery
docker pull "${image_name}"
# Cleanup hook run on any exit: force-remove the test container.
# NOTE(review): the || chain removes the image only when container
# removal fails — confirm that fallback (vs. removing both) is intended.
remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT
echo "--- Running container"
# Host-side HF cache directory, mounted into the container at HF_MOUNT.
HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
# All script arguments form the command line to run inside the container.
commands=$@
echo "Commands:$commands"
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."
# Test that we're launching on the machine that has
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1
fi
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
# One container per shard, pinned to a single GPU via HIP_VISIBLE_DEVICES.
# Output is prefixed per shard; `set -o pipefail` above makes the
# backgrounded pipeline's status reflect docker run, not the log loop.
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e MIOPEN_DEBUG_CONV_DIRECT=0 \
-e MIOPEN_DEBUG_CONV_GEMM=0 \
-e VLLM_ROCM_USE_AITER=1 \
-e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}_${GPU}" \
"${image_name}" \
/bin/bash -c "${commands_gpu}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
#wait for all processes to finish and collect exit codes
for pid in "${PIDS[@]}"; do
wait "${pid}"
STATUS+=($?)
done
# Propagate the first non-zero shard status.
for st in "${STATUS[@]}"; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
exit "${st}"
fi
done
else
# Non-sharded: run the whole command list in a single container.
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e MIOPEN_DEBUG_CONV_DIRECT=0 \
-e MIOPEN_DEBUG_CONV_GEMM=0 \
-e VLLM_ROCM_USE_AITER=1 \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "PYTHONPATH=${MYPYTHONPATH}" \
--name "${container_name}" \
"${image_name}" \
/bin/bash -c "${commands}"
fi
#!/bin/bash
# This script build the Ascend NPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Base ubuntu image with basic ascend development libraries and python installed
VLLM_OMNI_REPO="https://github.com/vllm-project/vllm-omni.git"
BASE_IMAGE_NAME="quay.nju.edu.cn/ascend/vllm-ascend:v0.11.0rc2"
image_name="npu/vllm-omni-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
# image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
# agent_idx is the second-to-last '-'-separated field of the agent name.
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
# Per-agent buildx builder and cache dir so concurrent agents on one
# host do not share build caches.
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir}
# Try building the docker image
# The heredoc below IS the Dockerfile (fed via `-f -`). Unescaped ${...}
# expand here in the shell; \$-escaped variables are left for Docker's
# build stage. NOTE(review): the backquoted `uname -i` near the end is
# unescaped, so it expands on the HOST at script time — confirm host and
# image architectures always match. NOTE(review): `pytest>=6.0` is
# unquoted inside a RUN, so the shell in the image treats `>=6.0` as an
# output redirection — confirm the intended version pin.
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host pypi-cache:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--build-arg BUILDKITE_PULL_REQUEST="${BUILDKITE_PULL_REQUEST}" \
--build-arg BUILDKITE_PULL_REQUEST_REPO="${BUILDKITE_PULL_REQUEST_REPO}" \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
RUN pip config set global.index-url http://pypi-cache:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host pypi-cache && \
apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install for pytest to make the docker build cache layer always valid
RUN --mount=type=cache,target=/root/.cache/pip \
pip install pytest>=6.0 pytest-cov modelscope
COPY . .
# Install vllm-omni
WORKDIR /workspace
ARG VLLM_OMNI_REPO=https://github.com/vllm-project/vllm-omni.git
ARG VLLM_OMNI_TAG=main
ARG BUILDKITE_PULL_REQUEST
ARG BUILDKITE_PULL_REQUEST_REPO
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
if [ "\$BUILDKITE_PULL_REQUEST" != "false" ] && [ -n "\$BUILDKITE_PULL_REQUEST" ]; then \
echo "Cloning and checking out PR #\$BUILDKITE_PULL_REQUEST..." && \
git clone \$VLLM_OMNI_REPO /workspace/vllm-omni && \
cd /workspace/vllm-omni && \
git fetch origin pull/\$BUILDKITE_PULL_REQUEST/head:pr-\$BUILDKITE_PULL_REQUEST && \
git checkout pr-\$BUILDKITE_PULL_REQUEST; \
else \
echo "Not a PR build, using main branch" && \
git clone --depth 1 \$VLLM_OMNI_REPO /workspace/vllm-omni; \
fi
RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-omni/
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
WORKDIR /workspace/vllm-omni
CMD ["/bin/bash"]
EOF
# Setup cleanup
# On any exit: remove the container, the freshly built image, and any
# dangling docker state; each step is best-effort.
remove_docker_container() {
docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
trap remove_docker_container EXIT
# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
# returns --device /dev/davinci0 --device /dev/davinci1
# Agent N with C cards owns devices /dev/davinci[(N-1)*C .. N*C-1];
# prints the flags space-joined (no trailing newline), returns 1 and
# writes "parse error" to stderr for a malformed name.
parse_and_gen_devices() {
  local agent_name="$1"
  if [[ ! "$agent_name" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
    echo "parse error" >&2
    return 1
  fi
  local agent_no="${BASH_REMATCH[1]}"
  local card_count="${BASH_REMATCH[2]}"
  # First device index owned by this agent (cards are numbered globally
  # on the host, contiguous per agent).
  local first=$(( (agent_no - 1) * card_count ))
  local flags=()
  local c
  for (( c = 0; c < card_count; c++ )); do
    flags+=( "--device" "/dev/davinci$(( first + c ))" )
  done
  # [*] joins array elements with single spaces, matching the original
  # trimmed output shape.
  printf '%s' "${flags[*]}"
}
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
# Per-agent model caches so concurrent agents do not collide on disk.
hf_model_cache_dir=/mnt/hf_cache${agent_idx}
ms_model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${hf_model_cache_dir}
mkdir -p ${ms_model_cache_dir}
# ${devices} is intentionally unquoted so it word-splits into multiple
# --device flags.
docker run \
--init \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v ${hf_model_cache_dir}:/root/.cache/huggingface \
-v ${ms_model_cache_dir}:/root/.cache/modelscope \
--network host \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \
bash -c '
set -e
VLLM_USE_MODELSCOPE=True pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
'
# Step definitions consumed by the AMD Jinja template; only steps whose
# mirror_hardwares list contains the selected pool are rendered, and
# grade: Blocking maps to soft_fail: false.
steps:
- label: "Diffusion Model Test"
timeout_in_minutes: 20
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
- label: "Diffusion Images API LoRA E2E"
timeout_in_minutes: 20
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
- label: "Diffusion Model CPU offloading Test"
timeout_in_minutes: 20
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
- label: "Diffusion Cache Backend Test"
timeout_in_minutes: 15
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
- label: "Diffusion Sequence Parallelism Test"
timeout_in_minutes: 20
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
# NOTE(review): this step has no mirror_hardwares/grade keys, so the
# Jinja template will skip it entirely — confirm that is intended.
- label: "Diffusion Tensor Parallelism Test"
timeout_in_minutes: 20
agent_pool: mi325_2
depends_on: amd-build
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
- label: "Diffusion GPU Worker Test"
timeout_in_minutes: 20
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- pytest -s -v tests/diffusion/test_diffusion_worker.py
- label: "Omni Model Test Qwen2-5-Omni"
timeout_in_minutes: 15
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
- label: "Omni Model Test Qwen3-Omni"
timeout_in_minutes: 15
agent_pool: mi325_2
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- export VLLM_TEST_CLEAN_GPU_MEMORY="1"
- pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
- pytest -s -v tests/e2e/online_serving/test_async_omni.py
- label: "Diffusion Image Edit Test"
timeout_in_minutes: 15
agent_pool: mi325_1
depends_on: amd-build
mirror_hardwares: [amdproduction]
grade: Blocking
commands:
- export GPU_ARCHS=gfx942
- export VLLM_LOGGING_LEVEL=DEBUG
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
{# vllm-omni customized version
Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/test-template-amd.j2
Last synced: 2025-12-15
Modifications: Removed unused CUDA/NVIDIA logic, keeping only AMD tests
#}
{% set docker_image_amd = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-rocm-omni" %}
{% set default_working_dir = "/app/vllm-omni" %}
{# Build the ROCm image once; every mirrored test step depends on it. #}
- group: "AMD Tests"
depends_on: ~
steps:
- label: "AMD: :docker: build image"
depends_on: ~
soft_fail: false
commands:
- "source .buildkite/scripts/docker_login_ecr_public.sh && safe_docker_login_ecr_public"
- "docker build -f docker/Dockerfile.rocm -t {{ docker_image_amd }} --target final --progress plain ."
- "docker push {{ docker_image_amd }}"
key: "amd-build"
env:
DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 1 # Machine occasionally fail
limit: 1
agents:
queue: cpu_queue_premerge
{# Emit one Buildkite step per template step whose mirror_hardwares
contains the selected pool (mirror_hw); steps without it are skipped. #}
{% for step in steps %}
{% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %}
- label: "{{ step.agent_pool }}: {{ step.label }}"
depends_on: amd-build
agents:
{% if step.agent_pool %}
queue: amd_{{ step.agent_pool }}
{% else %}
queue: amd_mi325_1
{% endif %}
command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
priority: 100
{# Blocking steps must pass; everything else is advisory. #}
{% if step.grade and step.grade == "Blocking" %}
soft_fail: false
{% else %}
soft_fail: true
{% endif%}
{% endif %}
{% endfor %}
default_install_hook_types:
- pre-commit
- commit-msg
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
hooks:
# list of supported hooks: https://pre-commit.com/hooks.html
- id: check-yaml
args: ["--unsafe"]
- id: debug-statements
- id: end-of-file-fixer
- id: mixed-line-ending
args: ["--fix=lf"]
- id: trailing-whitespace
args: ["--markdown-linebreak-ext=md"]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.10
hooks:
- id: ruff-check
args: [--output-format, github, --fix]
- id: ruff-format
- repo: https://github.com/crate-ci/typos
rev: typos-dict-v0.13.13
hooks:
- id: typos
# only for staged files
- repo: https://github.com/rhysd/actionlint
# v1.7.8+ sets `go 1.24.0` in go.mod, which older Go toolchains (and most
# current CI images) cannot parse. Pin to v1.7.7 until actionlint fixes the
# go.mod directive.
rev: v1.7.7
hooks:
- id: actionlint
files: ^\.github/workflows/.*\.ya?ml$
- repo: local
hooks:
- id: signoff-commit
name: Sign-off Commit
entry: bash
args:
- -c
- |
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
fi
language: system
verbose: true
stages: [commit-msg]
# Keep `suggestion` last
- id: suggestion
name: Suggestion
entry: bash -c 'echo "To bypass all the pre-commit hooks, add --no-verify to git commit. To skip a specific hook, prefix the commit command with SKIP=<hook-id>."'
language: system
verbose: true
pass_filenames: false
# Insert new entries above the `suggestion` entry
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/pre_commit/check_pickle_imports.py
language: python
types: [python]
additional_dependencies: [regex]
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.12"
jobs:
post_checkout:
- git fetch --unshallow || true
mkdocs:
configuration: mkdocs.yml
fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:
install:
- method: pip
path: .
extra_requirements:
- docs
# Contributing to vLLM-Omni
You may find information about contributing to vLLM-Omni on [Contributing](https://vllm-omni.readthedocs.io/en/latest/contributing/)
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# vllm-omni
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/docs/source/logos/vllm-omni-logo.png">
<img alt="vllm-omni" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/docs/source/logos/vllm-omni-logo.png" width=55%>
</picture>
</p>
<h3 align="center">
Easy, fast, and cheap omni-modality model serving for everyone
</h3>
vLLM 最初是为支持文本生成任务的大型语言模型而设计的。vLLM-Omni 是一个框架,它将 vLLM 的支持扩展到全模态模型推理和服务的领域。
\ No newline at end of file
<p align="center">
| <a href="https://vllm-omni.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> | <a href="docs/assets/WeChat.jpg"><b>WeChat</b></a> |
</p>
---
*Latest News* 🔥
- [2026/02] We released [0.14.0](https://github.com/vllm-project/vllm-omni/releases/tag/v0.14.0) - This is the first **stable release** of vLLM-Omni that expands Omni’s diffusion / image-video generation and audio / TTS stack, improves distributed execution and memory efficiency, and broadens platform/backend coverage (GPU/ROCm/NPU/XPU). It also brings meaningful upgrades to serving APIs, profiling & benchmarking, and overall stability. Please check our latest [paper](https://arxiv.org/abs/2602.02204) for architecture design and performance results.
- [2026/01] We released [0.12.0rc1](https://github.com/vllm-project/vllm-omni/releases/tag/v0.12.0rc1) - a major RC milestone focused on maturing the diffusion stack, strengthening OpenAI-compatible serving, expanding omni-model coverage, and improving stability across platforms (GPU/NPU/ROCm), please check our latest [design](https://docs.google.com/presentation/d/1qv4qMW1rKAqDREMXiUDLIgqqHQe7TDPj/edit?usp=sharing&ouid=110473603432222024453&rtpof=true&sd=true).
- [2025/11] vLLM community officially released [vllm-project/vllm-omni](https://github.com/vllm-project/vllm-omni) in order to support omni-modality models serving.
---
## About
[vLLM](https://github.com/vllm-project/vllm) was originally designed to support large language models for text-based autoregressive generation tasks. vLLM-Omni is a framework that extends its support for omni-modality model inference and serving:
- **Omni-modality**: Text, image, video, and audio data processing
- **Non-autoregressive Architectures**: extend the AR support of vLLM to Diffusion Transformers (DiT) and other parallel generation models
- **Heterogeneous outputs**: from traditional text generation to multimodal outputs
<p align="center">
<picture>
<img alt="vllm-omni" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/docs/source/architecture/omni-modality-model-architecture.png" width=55%>
</picture>
</p>
vLLM-Omni is fast with:
- State-of-the-art AR support by leveraging efficient KV cache management from vLLM
- Pipelined stage execution overlapping for high throughput performance
- Fully disaggregation based on OmniConnector and dynamic resource allocation across stages
vLLM-Omni is flexible and easy to use with:
- Heterogeneous pipeline abstraction to manage complex model workflows
- Seamless integration with popular Hugging Face models
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
vLLM-Omni seamlessly supports most popular open-source models on HuggingFace, including:
- Omni-modality models (e.g. Qwen-Omni)
- Multi-modality generation models (e.g. Qwen-Image)
## Getting Started
Visit our [documentation](https://vllm-omni.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm-omni.readthedocs.io/en/latest/getting_started/installation/)
- [Quickstart](https://vllm-omni.readthedocs.io/en/latest/getting_started/quickstart/)
- [List of Supported Models](https://vllm-omni.readthedocs.io/en/latest/models/supported_models/)
## Contributing
We welcome and value any contributions and collaborations.
Please check out [Contributing to vLLM-Omni](https://vllm-omni.readthedocs.io/en/latest/contributing/) for how to get involved.
## Citation
If you use vLLM-Omni for your research, please cite our [paper](https://arxiv.org/abs/2602.02204):
```bibtex
@article{yin2026vllmomni,
title={vLLM-Omni: Fully Disaggregated Serving for Any-to-Any Multimodal Models},
author={Peiqi Yin, Jiangyun Zhu, Han Gao, Chenguang Zheng, Yongxiang Huang, Taichang Zhou, Ruirui Yang, Weizhi Liu, Weiqing Chen, Canlin Guo, Didan Deng, Zifeng Mo, Cong Wang, James Cheng, Roger Wang, Hongsheng Liu},
journal={arXiv preprint arXiv:2602.02204},
year={2026}
}
```
## Join the Community
Feel free to ask questions, provide feedbacks and discuss with fellow users of vLLM-Omni in `#sig-omni` slack channel at [slack.vllm.ai](https://slack.vllm.ai) or vLLM user forum at [discuss.vllm.ai](https://discuss.vllm.ai).
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=vllm-project/vllm-omni&type=date&legend=top-left)](https://www.star-history.com/#vllm-project/vllm-omni&type=date&legend=top-left)
## License
Apache License 2.0, as found in the [LICENSE](./LICENSE) file.
# Benchmarks Overview and Architecture
This document explains the benchmark architecture across all benchmark assets in this repo. It describes what we measure, and where to find or plug in new scenarios. Per-task details remain in subfolder READMEs (e.g., `benchmarks/<model>/README.md`).
## Scope and goals
- Establish repeatable latency/throughput measurements for multimodal LLM pipelines.
- Provide both HF Transformers (offline) and vLLM-Omni (multi-stage/pipeline) baselines.
- Make it easy to plug in new datasets and models with minimal changes to the runner scripts.
## Dataset and inputs
- Default example: SeedTTS top-100 prompts (`benchmarks/build_dataset/top100.txt`) via `benchmarks/build_dataset/`.
- Extensible: drop in new prompt files or modality-aligned payloads; keep the expected format for the consuming scripts (e.g., one prompt per line).
- If you add a new dataset, document it under `benchmarks/<model>/README.md` and point scripts to your data path.
## Directory layout
- `benchmarks/build_dataset/` — dataset prep utilities (e.g., SeedTTS top100).
- `benchmarks/<model>/vllm_omni/` — vLLM-Omni pipeline benchmarks, logs, outputs.
- Add new tasks under `benchmarks/<model>/...` with the same pattern: `transformers/`, `vllm_omni/`, task-specific README, and (optionally) dataset prep notes.
## Reference workflows
- **HF Transformers (offline, single process)**
Script (example): `benchmarks/<model>/transformers/eval_qwen3_moe_omni_transformers.sh`
Outputs: `benchmark_results/perf_stats.json`, `benchmark_results/results.json`, `benchmark_results/audio/` (if audio is produced).
- **vLLM-Omni end-to-end pipeline**
Script (example): `benchmarks/<model>/vllm_omni/eval_qwen3_moe_omni.sh`
Outputs: `vllm_omni/logs/*.stats.jsonl` (per-stage/overall latency & TPS), `vllm_omni/logs/stage*.log`, `vllm_omni/outputs/` (text/audio artifacts).
- **Adding a new task/model**
1) Create `benchmarks/<model>/transformers/` and/or `benchmarks/<model>/vllm_omni/` with scripts referencing your model and dataset.
2) Add a task README describing dataset, configs, and expected outputs.
3) Keep the output/log structure similar for easy comparison (perf_stats/results/audio or text outputs; stats.jsonl/logs for pipeline).
## Metrics to watch
- **Throughput**: `overall_tps`, `*_tps_avg` per stage.
- **Latency distribution**: look for long tails in `*.stats.jsonl`.
- **Quality/completeness**: missing outputs or errors in stage logs indicate pipeline failures or misconfigurations.
## Troubleshooting
- Verify GPU/driver/FlashAttention2 requirements for your chosen model/config.
- Ensure network access for dataset/model downloads (Google Drive, Hugging Face, etc.).
- If outputs are missing or slow, inspect per-stage logs and `*.stats.jsonl` for errors, stragglers, or contention.
# Benchmark Dataset Preparation Guide
This guide describes how to download and prepare the SeedTTS test dataset for benchmarking Qwen-Omni models.
## Prerequisites
- Python 3.8+
- `gdown` for downloading from Google Drive
- Access to the benchmark scripts
## Steps
### 1. Navigate to the Dataset Directory
```bash
cd benchmarks/build_dataset
```
### 2. Install Dependencies
```bash
pip install gdown
```
### 3. Download the SeedTTS Test Dataset
Download the dataset from Google Drive:
```bash
gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP
```
### 4. Extract the Dataset
```bash
tar -xf seedtts_testset.tar
```
### 5. Prepare the Metadata File
Copy the English metadata file to the working directory:
```bash
cp seedtts_testset/en/meta.lst meta.lst
```
### 6. Extract Prompts
Extract the first N prompts from the metadata file:
```bash
# Extract top 100 prompts (adjust -n for different amounts)
python extract_tts_prompts.py -i meta.lst -o top100.txt -n 100
```
**Options:**
- `-i, --input`: Input metadata file (default: `meta.lst`)
- `-o, --output`: Output prompts file (default: `prompts.txt`)
- `-n, --num_lines`: Number of prompts to extract (required)
### 7. Clean Up (Optional)
Remove temporary files to save disk space:
```bash
rm -rf seedtts_testset
rm seedtts_testset.tar
rm meta.lst
```
## Quick Start (All-in-One)
```bash
# Full setup and benchmark
cd benchmarks/build_dataset
pip install gdown
gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP
tar -xf seedtts_testset.tar
cp seedtts_testset/en/meta.lst meta.lst
python extract_tts_prompts.py -i meta.lst -o top100.txt -n 100
rm -rf seedtts_testset seedtts_testset.tar meta.lst
```
#!/usr/bin/env python3
"""
Extract prompts from meta.lst and save them to a txt file.
Each line in meta.lst has the format:
ID|prompt_text|audio_path|target_text
This script extracts the prompt_text (second field) from the first N lines.
"""
import argparse
from pathlib import Path
def extract_prompts(input_file: str, output_file: str, num_lines: int) -> None:
    """
    Read the first ``num_lines`` lines of a ``meta.lst`` file and save the
    prompt text (second ``|``-separated field) of each valid line.

    Lines that are empty or have fewer than two fields are skipped, but they
    still count toward the ``num_lines`` limit.

    Args:
        input_file: Path to the meta.lst file.
        output_file: Path to the output txt file (one prompt per line).
        num_lines: Number of leading lines of the input to consider.
    """
    collected: list[str] = []
    with open(input_file, encoding="utf-8") as src:
        for idx, raw in enumerate(src):
            if idx >= num_lines:
                break
            record = raw.strip()
            if not record:
                # Blank lines carry no prompt; skip them.
                continue
            fields = record.split("|")
            if len(fields) < 2:
                # Malformed line without a prompt field; skip it.
                continue
            collected.append(fields[1])

    # Persist one prompt per line.
    with open(output_file, "w", encoding="utf-8") as dst:
        dst.writelines(prompt + "\n" for prompt in collected)

    # Report how much was extracted and where it went.
    print(f"Extracted {len(collected)} prompts from first {num_lines} lines")
    print(f"Saved to: {output_file}")
def main() -> None:
    """CLI entry point: parse arguments and run the prompt extraction.

    Exits with a non-zero status (via ``SystemExit``) when the input file
    does not exist, so calling shell scripts can detect the failure.
    """
    parser = argparse.ArgumentParser(description="Extract prompts from meta.lst file")
    parser.add_argument(
        "-i", "--input", type=str, default="meta.lst", help="Input meta.lst file path (default: meta.lst)"
    )
    parser.add_argument(
        "-o", "--output", type=str, default="prompts.txt", help="Output txt file path (default: prompts.txt)"
    )
    parser.add_argument(
        "-n", "--num_lines", type=int, required=True, help="Number of lines to extract from the beginning"
    )
    args = parser.parse_args()

    # Fail loudly: previously the error was printed to stdout and the
    # process exited 0, hiding the failure from shell pipelines.
    # SystemExit with a message prints to stderr and exits with status 1.
    if not Path(args.input).exists():
        raise SystemExit(f"Error: Input file '{args.input}' not found")

    extract_prompts(args.input, args.output, args.num_lines)


if __name__ == "__main__":
    main()
# Diffusion Serving Benchmark (Image/Video)
This folder contains an online-serving benchmark script for diffusion models.
It sends requests to a vLLM OpenAI-compatible endpoint and reports throughput,
latency percentiles, and optional SLO attainment.
The main entrypoint is:
- `benchmarks/diffusion/diffusion_benchmark_serving.py`
## 1. Quick Start
1. Start the server:
```bash
vllm serve Qwen/Qwen-Image --omni --port 8099
```
2. Run a minimal benchmark:
```bash
python3 benchmarks/diffusion/diffusion_benchmark_serving.py \
--base-url http://localhost:8099 \
--model Qwen/Qwen-Image \
--task t2i \
--dataset vbench \
--num-prompts 5
```
**Notes**
- The benchmark talks to `http://<host>:<port>/v1/chat/completions`.
- If you run the server on another host or port, pass `--base-url` accordingly.
## 2. Supported Datasets
The benchmark supports three dataset modes via `--dataset`:
- `vbench`: Built-in prompt/data loader.
- `trace`: Heterogeneous request traces (each request can have different resolution/frames/steps).
- `random`: Synthetic prompts for quick smoke tests.
### VBench dataset
If you use i2v/i2i bench datasets and need auto-download support, you may need:
```bash
uv pip install gdown
```
### Trace dataset
Use `--dataset trace` to replay a trace file. The trace can specify per-request fields such as:
- `width`, `height`
- `num_frames` (video)
- `num_inference_steps`
- `seed`, `fps`
- optional `slo_ms` (per-request SLO target)
By default (when `--dataset-path` is not provided), the script downloads a default trace from
the HuggingFace dataset repo `asukaqaqzz/Dit_Trace`. The default filename can depend on `--task`
(e.g., `t2v` uses a video trace).
Current defaults:
- `--task t2i` -> `sd3_trace.txt`
- `--task t2v` -> `cogvideox_trace.txt`
You can point to your own trace using `--dataset-path`.
## 3. Benchmark Parameters
### Basic flags
- `--base-url`: Server address (the script calls `.../v1/chat/completions`).
- `--model`: The OpenAI-compatible `model` field.
- `--task`: Task type (e.g., `t2i`, `t2v`, `i2i`, `i2v`).
- `--dataset`: Dataset mode (`vbench` / `trace` / `random`).
- `--num-prompts`: Number of requests to send.
Common optional flags:
- `--output-file`: Write metrics to a JSON file.
- `--disable-tqdm`: Disable the progress bar.
### Resolution / frames / steps: CLI defaults vs dataset fields
Related flags: `--width`, `--height`, `--num-frames`, `--fps`, `--num-inference-steps`.
- For `vbench` / `random`: these CLI flags act as global defaults for all generated requests.
- For `trace`: each request can carry its own fields (e.g., `width/height/num_frames/num_inference_steps`).
Precedence rules for `trace` (i.e., what actually gets sent):
- `width/height`: if either `--width` or `--height` is explicitly set, it overrides per-request values from the trace; otherwise per-request values are used when present.
- `num_frames`: per-request `num_frames` takes precedence; otherwise fall back to `--num-frames`.
- `num_inference_steps`: per-request `num_inference_steps` takes precedence; otherwise fall back to `--num-inference-steps`.
### SLO, warmup, and max concurrency
Enable SLO evaluation with `--slo`.
- If a request in the trace already has `slo_ms`, that value is used.
- Otherwise, the script runs warmup requests to infer a base unit time, estimates `expected_ms` by linearly scaling with area/frames/steps, and then sets `slo_ms = expected_ms * --slo-scale`.
Warmup flags:
- `--warmup-requests`: Number of warmup requests.
- `--warmup-num-inference-steps`: Steps used during warmup.
- For `--task t2v`: warmup requests are forced to use `num_frames=1` to make warmup faster and less noisy.
Traffic / concurrency flags:
- `--request-rate`: Target request rate (requests/second). If set to `inf`, the script sends all requests immediately.
- `--max-concurrency`: Max number of in-flight requests (default: `1`). This can hard-cap the achieved QPS: if it is too small, requests will queue behind the semaphore, and both achieved throughput and observed SLO attainment can be skewed.
This diff is collapsed.
# Benchmarks Guide
This README explains how to (1) prepare benchmark datasets and (2) run the provided Qwen3-Omni benchmarks.
## 1) Prepare the dataset (SeedTTS top100)
```bash
cd benchmarks/build_dataset
pip install gdown
# Download SeedTTS test set from Google Drive
gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP
# Extract
tar -xf seedtts_testset.tar
# Copy metadata and extract top-100 prompts
cp seedtts_testset/en/meta.lst meta.lst
python extract_tts_prompts.py -i meta.lst -o top100.txt -n 100
# (Optional) clean up to save space
rm -rf seedtts_testset seedtts_testset.tar meta.lst
```
Artifacts:
- `benchmarks/build_dataset/top100.txt` — 100 text prompts (one per line).
## 2) Run benchmarks
All commands assume repo root (`vllm-omni`).
### A. Transformers benchmark (offline, HF Transformers)
```
bash benchmarks/qwen3-omni/transformers/eval_qwen3_moe_omni_transformers.sh
```
What it does:
- Runs `qwen3_omni_moe_transformers.py` over `top100.txt` with `--num_prompts 100`.
- Outputs to `benchmarks/qwen3-omni/transformers/benchmark_results/`:
- `perf_stats.json` — aggregated & per-prompt TPS/latency (thinker/talker/code2wav/overall).
- `results.json` — per-prompt outputs and audio paths.
- `audio/` — ~100 generated `.wav` files.
Key checks:
- `overall_tps` and `*_tps_avg` should be non-zero and reasonably stable.
- Investigate any 0/NaN or unusually low TPS / long-tail latency.
### B. vLLM Omni end-to-end benchmark (pipeline)
```
bash benchmarks/qwen3-omni/vllm_omni/eval_qwen3_moe_omni.sh
```
What it does:
- Runs `examples/offline_inference/qwen3_omni/end2end.py` with `--enable-stats`.
- Uses `benchmarks/build_dataset/top100.txt` and writes to:
- Logs: `benchmarks/qwen3-omni/vllm_omni/logs/`
- `omni_llm_pipeline_text.orchestrator.stats.jsonl` — per-stage latency stats.
- `omni_llm_pipeline_text.overall.stats.jsonl` — end-to-end latency/TPS.
- `omni_llm_pipeline_text.stage{0,1,2}.log` — per-stage detailed logs/errors.
- Outputs: `benchmarks/qwen3-omni/vllm_omni/outputs/` — ~100 text and `.wav` files.
Key checks:
- Overall stats: end-to-end latency/TPS should be reasonable.
- Orchestrator stats: per-stage latency should be stable; investigate long tails.
- Stage logs: ensure no errors and no unusually slow stages.
## Performance snapshot
The chart below summarizes our measured Qwen3-Omni MoE end-to-end benchmark, comparing vLLM-Omni against HF Transformers. It shows the overall throughput advantage for vLLM-Omni. These are actual experiment results—please refer to this performance when evaluating or reproducing the benchmark.
![vLLM-Omni vs HF](./vllm-omni-vs-hf.png)
## Directory layout
- `benchmarks/build_dataset/` — dataset prep utilities (e.g., SeedTTS top100).
- `benchmarks/<model>/vllm_omni/` — vLLM-Omni pipeline benchmarks, logs, outputs.
- Add new tasks under `benchmarks/<model>/...` with the same pattern: `transformers/`, `vllm_omni/`, task-specific README, and (optionally) dataset prep notes.
- `benchmarks/<model>/vllm-omni-vs-hf.png` — current performance snapshot (overall throughput comparison).
- `benchmarks/<model>/transformers/` — HF Transformers benchmarks (offline reference).
## Troubleshooting
- Make sure GPU/driver/FlashAttention2 requirements are met for the chosen model.
- If downloads fail, confirm network access to Google Drive (`gdown`) and Hugging Face.
- If audio files are missing, check for errors in stage logs or model generation.
#!/bin/bash
# Qwen3-Omni Transformers Benchmark Evaluation Script
# This script must be run from the vllm-omni root directory.
#
# Fixes over the previous version:
#   - strict mode (set -euo pipefail) so command failures abort the run
#   - the "wrong directory" branch now exits 1 instead of 0
#   - diagnostics go to stderr
set -euo pipefail

# Get the directory where this script is located.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Navigate to the vllm-omni root directory (3 levels up from the script).
VLLM_OMNI_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
cd "$VLLM_OMNI_ROOT" || { echo "Error: Failed to navigate to vllm-omni directory" >&2; exit 1; }
echo "Working directory: $(pwd)"

# Verify we're in the correct directory before running the benchmark.
if [[ ! -f "benchmarks/qwen3-omni/transformers/qwen3_omni_moe_transformers.py" ]]; then
  echo "Error: Not in vllm-omni root directory. Please run from vllm-omni folder." >&2
  exit 1
fi

cd benchmarks/qwen3-omni/transformers
python qwen3_omni_moe_transformers.py --prompts_file ../../build_dataset/top100.txt --num_prompts 100
echo "Logs and outputs are saved to $(pwd)/benchmark_results:"
echo "  - perf_stats.json   Aggregated/per-prompt TPS and latency (thinker/talker/code2wav/overall)"
echo "  - results.json      Per-prompt outputs and audio paths"
echo "  - audio/            Generated wav files, there should be 100 wav file generated"
echo "Key checks: overall_tps and *_tps_avg should be non-zero and stable; investigate 0/NaN or unusually low TPS/long-tail latency."
import time
import torch
from transformers import Qwen3OmniMoeForConditionalGeneration
class Qwen3OmniMoeForConditionalGenerationWithLogging(Qwen3OmniMoeForConditionalGeneration):
    """Drop-in subclass whose ``generate`` additionally records per-stage
    performance metrics.

    For each call it measures generated-token counts, wall-clock time, and
    tokens/second for the three pipeline stages (thinker -> talker ->
    code2wav) plus overall totals. The stats dict for the most recent call is
    stored in ``self._perf_stats_last`` and appended to
    ``self._perf_stats_history``.
    """

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor | None = None,
        speaker: str = "Ethan",
        use_audio_in_video: bool = False,
        return_audio: bool | None = None,
        thinker_max_new_tokens: int = 1024,
        thinker_eos_token_id: int = 151645,
        talker_max_new_tokens: int = 4096,
        talker_do_sample: bool = True,
        talker_top_k: int = 50,
        talker_top_p: float = 1.0,
        talker_temperature: float = 0.9,
        talker_repetition_penalty: float = 1.05,
        **kwargs,
    ):
        """Generate text (and optionally speech audio) with perf logging.

        Args:
            input_ids: Prompt token ids; assumed shape [batch, seq] — batch
                must be 1 when audio output is requested (enforced below).
            speaker: Voice name, resolved via
                ``config.talker_config.speaker_id`` after lower-casing.
            use_audio_in_video: Flag shared with every stage's generate call.
            return_audio: Whether to run the talker/code2wav stages. When
                ``None``, defaults to ``self.has_talker``; raises ValueError
                if True while the talker module is not initialized.
            thinker_max_new_tokens / thinker_eos_token_id: Thinker stage
                generation limits.
            talker_*: Talker stage sampling hyperparameters.
            **kwargs: Keys prefixed ``thinker_``/``talker_``/``token2wav_``
                are routed to the matching stage with the prefix stripped;
                ``feature_attention_mask``, ``input_features`` and
                ``attention_mask`` receive special handling; all remaining
                keys are broadcast to every stage that does not already
                define them.

        Returns:
            ``(thinker_result, None)`` when no audio is generated, otherwise
            ``(thinker_result, talker_wavs)`` where ``talker_wavs`` is a
            float waveform tensor decoded by ``code2wav``.

        Side effects:
            Sets ``self._perf_stats_last`` and appends the stats dict to
            ``self._perf_stats_history`` (created lazily).
        """
        total_t0 = time.time()
        # Per-call stage metrics. "tokens" counts generated tokens only
        # (the prompt is excluded for the thinker stage).
        perf_stats = {
            "thinker_tokens": 0,
            "thinker_time_s": 0.0,
            "thinker_tps": 0.0,
            "talker_tokens": 0,
            "talker_time_s": 0.0,
            "talker_tps": 0.0,
            "code2wav_tokens": 0,
            "code2wav_time_s": 0.0,
            "code2wav_tps": 0.0,
            "total_tokens": 0,
            "total_time_s": 0.0,
            "total_tps": 0.0,
        }
        if return_audio and not self.has_talker:
            raise ValueError(
                "Cannot use talker when talker module not initialized. "
                "Use `enable_talker` method or set enable_talker in config "
                "to enable talker."
            )
        if return_audio is None:
            return_audio = self.has_talker
        # Route keyword arguments to the three stages by prefix.
        shared_kwargs = {"use_audio_in_video": use_audio_in_video}
        thinker_kwargs = {
            "max_new_tokens": thinker_max_new_tokens,
            "eos_token_id": thinker_eos_token_id,
        }
        talker_kwargs = {}
        token2wav_kwargs = {}
        if return_audio:
            speaker_id = self.config.talker_config.speaker_id.get(speaker.lower())
            if speaker_id is None:
                raise NotImplementedError(f"Speaker {speaker} not implemented")
            if input_ids.shape[0] != 1:
                raise NotImplementedError("Qwen3-Omni currently does not support batched inference with audio output")
            talker_suppressed_tokens = [
                i
                for i in range(
                    self.config.talker_config.text_config.vocab_size - 1024,
                    self.config.talker_config.text_config.vocab_size,
                )
                if i != self.config.talker_config.codec_eos_token_id
            ]  # Suppress additional special tokens, should not be predicted
            talker_kwargs = {
                "max_new_tokens": talker_max_new_tokens,
                "do_sample": talker_do_sample,
                "top_k": talker_top_k,
                "top_p": talker_top_p,
                "temperature": talker_temperature,
                "eos_token_id": self.config.talker_config.codec_eos_token_id,
                "repetition_penalty": talker_repetition_penalty,
                "suppress_tokens": talker_suppressed_tokens,
                # Hidden states are needed later to build code2wav input.
                "output_hidden_states": True,
                "return_dict_in_generate": True,
            }
            token2wav_kwargs = {}
        for key, value in kwargs.items():
            if key.startswith("thinker_"):
                thinker_kwargs[key[len("thinker_") :]] = value
            elif key.startswith("talker_"):
                talker_kwargs[key[len("talker_") :]] = value
            elif key.startswith("token2wav_"):
                token2wav_kwargs[key[len("token2wav_") :]] = value
            # Process special input values
            elif key == "feature_attention_mask":
                thinker_kwargs[key] = value
                talker_kwargs["audio_feature_lengths"] = torch.sum(value, dim=1)
            elif key in ("input_features", "attention_mask"):
                thinker_kwargs[key] = value
            # Put other key to shared kwargs
            else:
                shared_kwargs[key] = value
        # Merge kwargs: shared values never override stage-specific ones,
        # and the talker only accepts the whitelisted grid keys.
        for key, value in shared_kwargs.items():
            if key not in thinker_kwargs:
                thinker_kwargs[key] = value
            if key not in talker_kwargs and key in ["image_grid_thw", "video_grid_thw", "video_second_per_grid"]:
                talker_kwargs[key] = value
            if key not in token2wav_kwargs:
                token2wav_kwargs[key] = value
        # 1. Generate from thinker module
        generate_audio = return_audio and self.has_talker
        if generate_audio:
            # Hidden states of the thinker feed the talker below.
            thinker_kwargs["output_hidden_states"] = True
            thinker_kwargs["return_dict_in_generate"] = True
        t0 = time.time()
        thinker_result = self.thinker.generate(input_ids=input_ids, **thinker_kwargs)
        t1 = time.time()
        perf_stats["thinker_time_s"] = max(0.0, t1 - t0)
        try:
            # Generated length = full sequence minus the prompt. The
            # try/except tolerates plain-tensor results (no `.sequences`).
            prompt_len = int(input_ids.shape[1]) if input_ids is not None else 0
            total_len = int(thinker_result.sequences.shape[-1])
            thinker_out_len = max(0, total_len - prompt_len)
        except Exception:
            thinker_out_len = 0
        perf_stats["thinker_tokens"] = thinker_out_len
        perf_stats["thinker_tps"] = (
            (thinker_out_len / perf_stats["thinker_time_s"]) if perf_stats["thinker_time_s"] > 0 else 0.0
        )
        if not generate_audio:
            # Text-only path: finalize totals and return without audio.
            perf_stats["total_tokens"] = perf_stats["thinker_tokens"]
            perf_stats["total_time_s"] = time.time() - total_t0
            perf_stats["total_tps"] = (
                (perf_stats["total_tokens"] / perf_stats["total_time_s"]) if perf_stats["total_time_s"] > 0 else 0.0
            )
            # attach stats to self
            setattr(self, "_perf_stats_last", perf_stats)
            if not hasattr(self, "_perf_stats_history"):
                setattr(self, "_perf_stats_history", [])
            self._perf_stats_history.append(perf_stats)
            return thinker_result, None
        # 2. Prepare talker input
        # Concatenate per-step embedding-layer states across generation steps.
        thinker_embed = torch.cat([hidden_states[0] for hidden_states in thinker_result.hidden_states], dim=1).to(
            self.talker.device
        )  # [1 t d]
        # Same concatenation at the layer the talker consumes from.
        thinker_hidden = torch.cat(
            [
                hidden_states[self.config.talker_config.accept_hidden_layer]
                for hidden_states in thinker_result.hidden_states
            ],
            dim=1,
        ).to(self.talker.device)  # [1 t d]
        # Positions of <|im_start|> tokens, plus a sentinel at sequence end,
        # delimiting the chatml segments iterated below.
        im_start_indexes = torch.cat(
            (
                torch.nonzero(input_ids[0] == self.config.im_start_token_id).squeeze(),
                torch.tensor([thinker_result.sequences.shape[-1]], device=input_ids.device, dtype=input_ids.dtype),
            ),
            dim=-1,
        ).to(self.talker.device)  # Shape [n_starts + 1]; Take batch 0 since batched inference is not supported here.
        multimodal_mask = (
            (thinker_result.sequences == self.config.thinker_config.audio_token_id) |
            (thinker_result.sequences == self.config.thinker_config.image_token_id) |
            (thinker_result.sequences == self.config.thinker_config.video_token_id)
        ).to(self.talker.device)  # [1 t] # fmt: skip
        talker_special_tokens = torch.tensor(
            [[self.config.tts_bos_token_id, self.config.tts_eos_token_id, self.config.tts_pad_token_id]],
            device=self.thinker.device,
            dtype=input_ids.dtype,
        )
        # Project the three TTS control-token embeddings into talker space.
        tts_bos_embed, tts_eos_embed, tts_pad_embed = (
            self.talker.text_projection(self.thinker.get_input_embeddings()(talker_special_tokens))
            .to(self.talker.device)
            .chunk(3, dim=1)
        )  # 3 * [1 1 d]
        talker_input_embeds = []  # [1 t d]
        talker_input_ids = []
        # For every chatml parts
        for i in range(len(im_start_indexes) - 1):
            im_start_index = im_start_indexes[i]
            segment_end_index = im_start_indexes[i + 1]
            role_token = input_ids[0][im_start_index + 1]
            # Talker should ignore thinker system prompt
            if role_token == self.config.system_token_id:
                continue
            # Talker takes word embeddings for tokens and hidden state from `accept_hidden_layer` for multimodal inputs
            elif role_token == self.config.user_token_id:
                talker_user_part = self._get_talker_user_parts(
                    im_start_index, segment_end_index, multimodal_mask, thinker_hidden, thinker_embed
                )
                talker_input_embeds.append(talker_user_part)
                talker_input_ids.append(thinker_result.sequences[:, im_start_index:segment_end_index])
            # Take assistant output (for now)
            elif role_token == self.config.assistant_token_id and i == len(im_start_indexes) - 2:
                talker_assistant_embeds, talker_assistant_ids, trailing_text_hidden = self._get_talker_assistant_parts(
                    im_start_index,
                    segment_end_index,
                    speaker_id,
                    thinker_embed,
                    tts_pad_embed,
                    tts_bos_embed,
                    tts_eos_embed,
                )
                talker_input_embeds.append(talker_assistant_embeds)
                talker_input_ids.append(talker_assistant_ids)
            # History assistant output (ignore for now)
            elif role_token == self.config.assistant_token_id and i != len(im_start_indexes) - 2:
                continue
            else:
                raise AssertionError("Expect role id after <|im_start|> (assistant, user, system)")
        talker_input_embed = torch.cat([embed.to(self.talker.device) for embed in talker_input_embeds], dim=1)
        talker_input_id = torch.cat([embed.to(self.talker.device) for embed in talker_input_ids], dim=1)
        t2 = time.time()
        talker_result = self.talker.generate(
            inputs_embeds=talker_input_embed,
            trailing_text_hidden=trailing_text_hidden,
            tts_pad_embed=tts_pad_embed,
            talker_input_ids=talker_input_id,  # Not use input_ids to prevent repetition penalty out of bound
            **talker_kwargs,
        )
        t3 = time.time()
        perf_stats["talker_time_s"] = max(0.0, t3 - t2)
        # Stack the final-layer state per step into codec codes; presumably
        # shape (B, num_quantizers, T) after the transpose — TODO confirm.
        talker_codes = (
            torch.stack([hid[-1] for hid in talker_result.hidden_states if hid[-1] is not None], dim=1)
            .transpose(1, 2)
            .to(self.code2wav.device)
        )
        try:
            # codes shape: (B, num_quantizers, T). We log T as token length.
            perf_stats["talker_tokens"] = int(talker_codes.shape[-1])
        except Exception:
            perf_stats["talker_tokens"] = 0
        perf_stats["talker_tps"] = (
            (perf_stats["talker_tokens"] / perf_stats["talker_time_s"]) if perf_stats["talker_time_s"] > 0 else 0.0
        )
        # 3. Decode codec codes to waveform, timed as the code2wav stage.
        t4 = time.time()
        talker_wavs = self.code2wav.chunked_decode(talker_codes, chunk_size=300, left_context_size=25).float()
        t5 = time.time()
        perf_stats["code2wav_time_s"] = max(0.0, t5 - t4)
        perf_stats["code2wav_tokens"] = perf_stats["talker_tokens"]  # same T, not times 16
        perf_stats["code2wav_tps"] = (
            (perf_stats["code2wav_tokens"] / perf_stats["code2wav_time_s"])
            if perf_stats["code2wav_time_s"] > 0
            else 0.0
        )
        # Overall totals: thinker + talker tokens over end-to-end wall time.
        perf_stats["total_tokens"] = perf_stats["thinker_tokens"] + perf_stats["talker_tokens"]
        perf_stats["total_time_s"] = time.time() - total_t0
        perf_stats["total_tps"] = (
            (perf_stats["total_tokens"] / perf_stats["total_time_s"]) if perf_stats["total_time_s"] > 0 else 0.0
        )
        setattr(self, "_perf_stats_last", perf_stats)
        if not hasattr(self, "_perf_stats_history"):
            setattr(self, "_perf_stats_history", [])
        self._perf_stats_history.append(perf_stats)
        return thinker_result, talker_wavs.float()
# Public API of this module: only the logging-instrumented model wrapper.
__all__ = ["Qwen3OmniMoeForConditionalGenerationWithLogging"]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment