Merge tag 'v0.9.1' into v0.9.1-ori

cc7f22a8 · zhuwenwen · b9ea0c09 · b6553be1 · cc7f22a8 · cc7f22a8
Commit cc7f22a8 authored Jun 11, 2025 by zhuwenwen
20 changed files
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import sys

--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import os

--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path
 import pytest

--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml

--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
 ### Visualizing the results
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.

--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import os

--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse

--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import json

--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from lmdeploy.serve.openai.api_client import APIClient

--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import datetime
 import json

--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
 steps:
  - label: "Build wheel - CUDA 12.8"
+    id: build-wheel-cuda-12-8
    agents:
      queue: cpu_queue_postmerge
    commands:
@@ -11,6 +12,7 @@ steps:
      DOCKER_BUILDKIT: "1"
  - label: "Build wheel - CUDA 12.6"
+    id: build-wheel-cuda-12-6
    agents:
      queue: cpu_queue_postmerge
    commands:
@@ -28,6 +30,7 @@ steps:
  - label: "Build wheel - CUDA 11.8"
    # depends_on: block-build-cu118-wheel
+    id: build-wheel-cuda-11-8
    agents:
      queue: cpu_queue_postmerge
    commands:
@@ -44,6 +47,7 @@ steps:
  - label: "Build release image"
    depends_on: block-release-image-build
+    id: build-release-image
    agents:
      queue: cpu_queue_postmerge
    commands:
@@ -51,6 +55,18 @@ steps:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+  - label: "Annotate release workflow"
+    depends_on:
+      - build-release-image
+      - build-wheel-cuda-12-8
+      - build-wheel-cuda-12-6
+      - build-wheel-cuda-11-8
+    id: annotate-release-workflow
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "bash .buildkite/scripts/annotate-release.sh"
  - label: "Build and publish TPU release image"
    depends_on: ~
    if: build.env("NIGHTLY") == "1"
@@ -70,9 +86,10 @@ steps:
      DOCKER_BUILDKIT: "1"
  - input: "Provide Release version here"
+    id: input-release-version
    fields:
      - text: "What is the release version?"
-        key: "release-version"
+        key: release-version
  - block: "Build CPU release image"
    key: block-cpu-release-image-build

--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
+#!/bin/bash
+# Post a Buildkite annotation (context 'release-workflow') listing the commands
+# for downloading the release wheels and for pulling, tagging and pushing the
+# release image.
+#
+# Inputs:
+#   - build meta-data key `release-version` (read via buildkite-agent)
+#   - $BUILDKITE_COMMIT (expanded inside the annotation text)
+# Exits with status 1 when the release version is empty.
+set -ex
+# Get release version and strip leading 'v' if present
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
+if [ -z "$RELEASE_VERSION" ]; then
+  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
+  exit 1
+fi
+# The heredoc delimiter is unquoted so ${RELEASE_VERSION} and ${BUILDKITE_COMMIT}
+# are expanded; the backticks are escaped so they reach the annotation literally
+# instead of starting a command substitution.
+# NOTE: the closing EOF must be the only text on its line. Trailing whitespace
+# stops it from matching, so the heredoc would run to end-of-file and the
+# delimiter line would leak into the annotation.
+buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
+To download the wheel:
+\`\`\`
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+\`\`\`
+To download and upload the image:
+\`\`\`
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
+docker tag vllm/vllm-openai vllm/vllm-openai:latest
+docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
+docker push vllm/vllm-openai:latest
+docker push vllm/vllm-openai:v${RELEASE_VERSION}
+\`\`\`
+EOF
\ No newline at end of file
--- a/.buildkite/scripts/ci-clean-log.sh
+++ b/.buildkite/scripts/ci-clean-log.sh
+#!/bin/bash
+# Usage: ./ci-clean-log.sh ci.log
+# This script strips timestamps and color codes from CI log files.
+# The given file is rewritten in place (sed -i); no copy of the original is made here.
+# Check if argument is given
+if [ $# -lt 1 ]; then
+    echo "Usage: $0 ci.log"
+    exit 1
+fi
+INPUT_FILE="$1"
+# Strip timestamps: a leading "[YYYY-MM-DDTHH:MM:SSZ] " prefix at the start of a line
+sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
+# Strip colorization: ESC '[' followed by digits/semicolons and a final 'm' or 'K'
+sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
+if [[ $commands == *"pytest -v -s lora"* ]]; then
+  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
+fi
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
  commands="${commands} \

--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -7,6 +7,7 @@ set -ex
 # Setup cleanup
 remove_docker_container() {
  if [[ -n "$container_id" ]]; then
+      podman stop --all -t0
      podman rm -f "$container_id" || true
  fi
  podman system prune -f
@@ -37,7 +38,7 @@ function cpu_tests() {
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
+    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
 }
 # All of CPU tests are expected to be finished less than 40 mins.

--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -6,72 +6,70 @@ set -ex
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
+export CMAKE_BUILD_PARALLEL_LEVEL=32
 # Setup cleanup
 remove_docker_container() { 
    set -e; 
-    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
+    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; 
-    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
 }
 trap remove_docker_container EXIT
 remove_docker_container
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 function cpu_tests() {
  set -e
  export NUMA_NODE=$2
-  export BUILDKITE_BUILD_NUMBER=$3
  # offline inference
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
  # Run basic model test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -v -s tests/kernels/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
+    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/decoder_only/language -m cpu_model
+    pytest -v -s tests/models/language/generation -m cpu_model
-    pytest -v -s tests/models/embedding/language -m cpu_model
+    pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
+    pytest -v -s tests/models/multimodal/generation \
-    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+                --ignore=tests/models/multimodal/generation/test_mllama.py \
-    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+                --ignore=tests/models/multimodal/generation/test_pixtral.py \
+                -m cpu_model"
  # Run compressed-tensor test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
  # Run AWQ test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -s -v \
+    VLLM_USE_V1=0 pytest -s -v \
    tests/quantization/test_ipex_quant.py"
  # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v -k cpu_model \
    tests/basic_correctness/test_chunked_prefill.py"  
  # online serving
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    export VLLM_CPU_KVCACHE_SPACE=10 
-    export VLLM_CPU_OMP_THREADS_BIND=$1
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \
@@ -83,7 +81,7 @@ function cpu_tests() {
      --tokenizer facebook/opt-125m"
  # Run multi-lora tests
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v \
    tests/lora/test_qwen2vl.py"
@@ -91,4 +89,4 @@ function cpu_tests() {
-# All of CPU tests are expected to be finished less than 40 mins.
+# All CPU tests are expected to finish in less than 1 hour.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
+timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -2,102 +2,184 @@
 set -xu
-# Build the docker image.
-docker build -f docker/Dockerfile.tpu -t vllm-tpu .
-# Set up cleanup.
+remove_docker_container() { 
-remove_docker_container() { docker rm -f tpu-test || true; }
+    docker rm -f tpu-test || true; 
+    docker rm -f vllm-tpu || true;
+}
 trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container
+# Build the docker image.
+docker build -f docker/Dockerfile.tpu -t vllm-tpu .
+# Set up cleanup.
+# Prune Docker images and volumes when the filesystem that holds Docker's root
+# directory is more than $threshold percent full; otherwise only report usage.
+# Calls `exit 1` (ending the calling script) if the root directory cannot be
+# determined.
+cleanup_docker() {
+  # Get Docker's root directory
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # Check disk usage of the filesystem where Docker's root directory is located
+  # (last line of df output, 5th column, with the trailing '%' removed)
+  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Define the threshold
+  threshold=70
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
+    docker image prune -f
+    # Remove unused volumes / force the system prune for old images as well.
+    # The system prune only runs if the volume prune succeeded (&&).
+    docker volume prune -f && docker system prune --force --filter "until=72h" --all
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
+cleanup_docker
 # For HF_TOKEN.
 source /etc/environment
-# Run a simple end-to-end example.
 docker run --privileged --net host --shm-size=16G -it \
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
-    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+    vllm-tpu /bin/bash -c '
-    && python3 -m pip install pytest pytest-asyncio tpu-info \
+set -e # Exit immediately if a command exits with a non-zero status.
-    && python3 -m pip install lm_eval[api]==0.4.4 \
+set -u # Treat unset variables as an error.
-    && export VLLM_XLA_CACHE_PATH= \
-    && export VLLM_USE_V1=1 \
+echo "--- Starting script inside Docker container ---"
-    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
-    && echo HARDWARE \
+# Create results directory
-    && tpu-info \
+RESULTS_DIR=$(mktemp -d)
-    && { \
+# If mktemp fails, set -e will cause the script to exit.
-        echo TEST_0: Running test_perf.py; \
+echo "Results will be stored in: $RESULTS_DIR"
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
-        echo TEST_0_EXIT_CODE: \$?; \
+# Install dependencies
-    } & \
+echo "--- Installing Python dependencies ---"
-    { \
+python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
-        echo TEST_1: Running test_compilation.py; \
+    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
+    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
-        echo TEST_1_EXIT_CODE: \$?; \
+echo "--- Python dependencies installed ---"
-    } & \
+export VLLM_USE_V1=1
-    { \
+export VLLM_XLA_CHECK_RECOMPILATION=1
-        echo TEST_2: Running test_basic.py; \
+export VLLM_XLA_CACHE_PATH=
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
+echo "Using VLLM V1"
-        echo TEST_2_EXIT_CODE: \$?; \
-    } & \
+echo "--- Hardware Information ---"
-    { \
+tpu-info
-        echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
+echo "--- Starting Tests ---"
-        python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
+set +e
-        echo TEST_3_EXIT_CODE: \$?; \
+overall_script_exit_code=0
-    } & \
-    { \
+# --- Test Definitions ---
-        echo TEST_4: Running test_quantization_accuracy.py; \
+# If a test fails, this function will print logs and will not cause the main script to exit.
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
+run_test() {
-        echo TEST_4_EXIT_CODE: \$?; \
+    local test_num=$1
-    } & \
+    local test_name=$2
-    { \
+    local test_command=$3
-        echo TEST_5: Running examples/offline_inference/tpu.py; \
+    local log_file="$RESULTS_DIR/test_${test_num}.log"
-        python3 /workspace/vllm/examples/offline_inference/tpu.py; \
+    local actual_exit_code
-        echo TEST_5_EXIT_CODE: \$?; \
-    } & \
+    echo "--- TEST_$test_num: Running $test_name ---"
-    { \
-        echo TEST_6: Running test_tpu_model_runner.py; \
+    # Execute the test command.
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
+    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
-        echo TEST_6_EXIT_CODE: \$?; \
+    actual_exit_code=$?
-    } & \
-    { \
+    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
-        echo TEST_7: Running test_sampler.py; \
+    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
-        echo TEST_7_EXIT_CODE: \$?; \
+    if [ "$actual_exit_code" -ne 0 ]; then
-    } & \
+        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
-    { \
+        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
-        echo TEST_8: Running test_topk_topp_sampler.py; \
+        if [ -f "$log_file" ]; then
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
+            cat "$log_file" >&2
-        echo TEST_8_EXIT_CODE: \$?; \
+        else
-    } & \
+            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
-    { \
+        fi
-        echo TEST_9: Running test_multimodal.py; \
+        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
+        return "$actual_exit_code" # Return the failure code
-        echo TEST_9_EXIT_CODE: \$?; \
+    else
-    } & \
+        echo "TEST_$test_num ($test_name) PASSED."
-    { \
+        return 0 # Return success
-        echo TEST_10: Running test_pallas.py; \
+    fi
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
+}
-        echo TEST_10_EXIT_CODE: \$?; \
-    } & \
+# Helper function to call run_test and update the overall script exit code
-    { \
+run_and_track_test() {
-        echo TEST_11: Running test_struct_output_generate.py; \
+    local test_num_arg="$1"
-        python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
+    local test_name_arg="$2"
-        echo TEST_11_EXIT_CODE: \$?; \
+    local test_command_arg="$3"
-    } & \
-    { \
+    # Run the test
-        echo TEST_12: Running test_moe_pallas.py; \
+    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
-        python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
+    local test_specific_exit_code=$?
-        echo TEST_12_EXIT_CODE: \$?; \
-    } & \
+    # If the test failed, set the overall script exit code to 1
-    # Disable the TPU LoRA tests until the feature is activated
+    if [ "$test_specific_exit_code" -ne 0 ]; then
-    # & { \
+        # No need for extra echo here, run_test already logged the failure.
-    #     echo TEST_13: Running test_moe_pallas.py; \
+        overall_script_exit_code=1
-    #     python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
+    fi
-    #     echo TEST_13_EXIT_CODE: \$?; \
+}
-    # } & \
-    wait \
+# --- Actual Test Execution ---
-    && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
+run_and_track_test 0 "test_perf.py" \
-"
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
+run_and_track_test 1 "test_compilation.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
+run_and_track_test 2 "test_basic.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
+run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
+run_and_track_test 4 "test_quantization_accuracy.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
+run_and_track_test 5 "examples/offline_inference/tpu.py" \
+    "python3 /workspace/vllm/examples/offline_inference/tpu.py"
+run_and_track_test 6 "test_tpu_model_runner.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
+run_and_track_test 7 "test_sampler.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
+run_and_track_test 8 "test_topk_topp_sampler.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
+run_and_track_test 9 "test_multimodal.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
+run_and_track_test 10 "test_pallas.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
+run_and_track_test 11 "test_struct_output_generate.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+run_and_track_test 12 "test_moe_pallas.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
+run_and_track_test 13 "test_lora.py" \
+    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
+run_and_track_test 14 "test_tpu_qkv_linear.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
+run_and_track_test 15 "test_spmd_model_weight_loading.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
+# After all tests have been attempted, exit with the overall status.
+if [ "$overall_script_exit_code" -ne 0 ]; then
+    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
+else
+    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
+fi
+exit "$overall_script_exit_code"
+' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
+# Capture the exit code of the docker run command
+DOCKER_RUN_EXIT_CODE=$?
+# The trap will run for cleanup.
+# Exit the main script with the Docker run command's exit code.
+if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
+    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
+    exit "$DOCKER_RUN_EXIT_CODE"
+else
+    echo "Docker run command completed successfully."
+    exit 0
+fi
 # TODO: This test fails because it uses RANDOM_SEED sampling
-# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
--- a/.buildkite/scripts/rerun-test.sh
+++ b/.buildkite/scripts/rerun-test.sh
+#!/bin/bash
+# Usage: ./rerun-test.sh path/to/test.py::test_name
+# Runs the given test with pytest over and over; the loop stops at the first
+# run whose pytest exit status is non-zero.
+# Check if argument is given
+if [ $# -lt 1 ]; then
+    echo "Usage: $0 path/to/test.py::test_name"
+    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
+    exit 1
+fi
+TEST=$1
+# COUNT is the number of the run about to start; it is bumped and printed
+# after each passing run, so the first message shown is "RUN NUMBER 2".
+COUNT=1
+while pytest -sv "$TEST"; do
+    COUNT=$((COUNT + 1))
+    echo "RUN NUMBER ${COUNT}"
+done
--- a/.buildkite/scripts/tpu/cleanup_docker.sh
+++ b/.buildkite/scripts/tpu/cleanup_docker.sh
+#!/bin/bash
+# Prune Docker images and volumes when the filesystem that holds Docker's root
+# directory is more than $threshold percent full; otherwise only report usage.
+# Exits with status 1 if the Docker root directory cannot be determined.
+set -euo pipefail
+docker_root=$(docker info -f '{{.DockerRootDir}}')
+if [ -z "$docker_root" ]; then
+  echo "Failed to determine Docker root directory."
+  exit 1
+fi
+echo "Docker root directory: $docker_root"
+# Check disk usage of the filesystem where Docker's root directory is located
+# (last line of df output, 5th column, with the trailing '%' removed)
+disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+# Define the threshold
+threshold=70
+if [ "$disk_usage" -gt "$threshold" ]; then
+  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+  # Remove dangling images (those that are not tagged and not used by any container)
+  docker image prune -f
+  # Remove unused volumes / force the system prune for old images as well.
+  # The system prune only runs if the volume prune succeeded (&&).
+  docker volume prune -f && docker system prune --force --filter "until=72h" --all
+  echo "Docker images and volumes cleanup completed."
+else
+  echo "Disk usage is below $threshold%. No cleanup needed."
+fi
--- a/.buildkite/scripts/tpu/config_v6e_1.env
+++ b/.buildkite/scripts/tpu/config_v6e_1.env
+# Environment config
+TEST_NAME=llama8b
+CONTAINER_NAME=vllm-tpu
+# vllm config
+MODEL=meta-llama/Llama-3.1-8B-Instruct
+MAX_NUM_SEQS=512
+MAX_NUM_BATCHED_TOKENS=512
+TENSOR_PARALLEL_SIZE=1
+MAX_MODEL_LEN=2048
+DOWNLOAD_DIR=/mnt/disks/persist
+EXPECTED_THROUGHPUT=8.0
+INPUT_LEN=1800
+OUTPUT_LEN=128