Merge tag 'v0.18.0' into v0.18.0-ori

3fb4b5fa · zhuwenwen · bcf25339 · 89138b21 · 3fb4b5fa · 3fb4b5fa
Commit 3fb4b5fa authored Mar 23, 2026 by zhuwenwen
20 changed files
--- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -10,7 +10,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -37,7 +36,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -64,7 +62,6 @@
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -78,5 +75,83 @@
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
+    },
+    {
+        "test_name": "serving_deepseek_r1",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "tensor_parallel_size": 8,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 200,
+            "async-scheduling": "",
+            "dtype": "bfloat16"
+        },
+        "client_parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "tensor_parallel_size": 8,
+            "disable_log_stats": "",
+            "max-model-len": 2048,
+            "max-num-seqs": 128,
+            "async-scheduling": "",
+            "enable_expert_parallel": "",
+            "max-num-batched-tokens": 4096
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_qwen3_8b",
+        "qps_list": [1, 4, 10, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "Qwen/Qwen3-8B",
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "disable_log_stats": "",
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "Qwen/Qwen3-8B",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests.json
@@ -5,7 +5,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -23,7 +22,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -41,7 +39,6 @@
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -59,7 +56,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 
            "tensor_parallel_size": 4,
-            "swap_space": 16,
            "speculative_config": {
                "model": "turboderp/Qwama-0.5B-Instruct",
                "num_speculative_tokens": 4,

--- a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
@@ -57,5 +57,67 @@
            "max-num-seqs": 512,
            "async-scheduling": ""
        }
+    },
+    {
+        "test_name": "throughput_deepseek_r1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_name": "sharegpt",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 384,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "tensor_parallel_size": 8,
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_name": "sharegpt",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": "",
+            "enable_expert_parallel": ""
+        }
+    },
+    {
+        "test_name": "throughput_qwen3_8b",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "Qwen/Qwen3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_name": "sharegpt",
+            "num_prompts": 1000,
+            "max-num-seqs": 512,
+            "backend": "vllm",
+            "async-scheduling": ""
+        }
    }
 ]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -83,7 +83,7 @@ steps:
        agents:
          queue: cpu_queue_postmerge
        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
          - "mkdir artifacts"
          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
@@ -152,7 +152,7 @@ steps:
          queue: cpu_queue_postmerge
        commands:
          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
        env:

--- a/.buildkite/scripts/annotate-rocm-release.sh
+++ b/.buildkite/scripts/annotate-rocm-release.sh
@@ -25,7 +25,7 @@ S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
 S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"

 # Format ROCm version for path (e.g., "7.1" -> "rocm710")
-ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
+ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
 ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
 buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
 ## ROCm Wheel and Docker Image Releases
@@ -68,7 +68,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl .
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
 \`\`\`

@@ -80,7 +80,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-
 - **torchvision**: TorchVision for ROCm PyTorch
 - **torchaudio**: Torchaudio for ROCm PyTorch
 - **amdsmi**: AMD SMI Python bindings
-- **aiter**: Aiter for ROCm
+- **amd_aiter**: Aiter for ROCm
 - **flash-attn**: Flash Attention for ROCm

 ### :warning: Notes

--- a/.buildkite/scripts/cache-rocm-base-wheels.sh
+++ b/.buildkite/scripts/cache-rocm-base-wheels.sh
@@ -83,7 +83,7 @@ case "${1:-}" in
            exit 1
        fi

-        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
        if [[ "$WHEEL_COUNT" -eq 0 ]]; then
            echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
            exit 1
@@ -110,9 +110,9 @@ case "${1:-}" in

        echo ""
        echo "Downloaded wheels:"
-        ls -lh artifacts/rocm-base-wheels/
+        find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;

-        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
        echo ""
        echo "Total: $WHEEL_COUNT wheels"
        echo "========================================"

--- a/.buildkite/scripts/check-ray-compatibility.sh
+++ b/.buildkite/scripts/check-ray-compatibility.sh
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Check if Ray LLM can generate lock files that are compatible with this
+# version of vllm. Downloads Ray's requirement files and runs a full
+# dependency resolution with the installed vllm's constraints to see if
+# a valid lock file can be produced.
+#
+# See: https://github.com/vllm-project/vllm/issues/33599
+
+set -eo pipefail
+
+RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
+
+WORK_DIR=$(mktemp -d)
+trap 'rm -rf "$WORK_DIR"' EXIT
+
+# Fetch all Ray requirement files used in the LLM depset pipeline
+echo ">>> Fetching Ray requirement files"
+RAY_FILES=(
+    "requirements.txt"
+    "requirements/cloud-requirements.txt"
+    "requirements/base-test-requirements.txt"
+    "requirements/llm/llm-requirements.txt"
+    "requirements/llm/llm-test-requirements.txt"
+)
+for FILE in "${RAY_FILES[@]}"; do
+    LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
+    echo "    ${FILE}"
+    curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
+done
+
+# Extract installed vllm deps
+echo ">>> Extracting installed vllm dependency constraints"
+python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
+"""Write out the installed vllm's dependencies as pip constraint lines.
+
+Ray uses vllm[audio], so audio-extra deps are included with their extra
+markers stripped. The resolver cannot evaluate extra markers for a
+package that is not itself being resolved from an index, so we activate
+them manually here.
+"""
+import importlib.metadata
+import re
+import sys
+
+out_path = sys.argv[1]
+raw_reqs = importlib.metadata.requires("vllm") or []
+
+# Ray uses vllm[audio] – activate that extra.
+ACTIVE_EXTRAS = {"audio"}
+EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
+
+lines = []
+for r in raw_reqs:
+    if ";" not in r:
+        # Unconditional dep — always include.
+        lines.append(r.strip())
+        continue
+
+    req_part, _, marker_part = r.partition(";")
+    marker_part = marker_part.strip()
+
+    extra_matches = EXTRA_RE.findall(marker_part)
+    if not extra_matches:
+        # Non-extra marker (python_version, etc.) — keep as-is.
+        lines.append(r.strip())
+        continue
+
+    if not ACTIVE_EXTRAS.intersection(extra_matches):
+        continue  # Skip inactive extras (tensorizer, bench, …).
+
+    # Strip the extra== conditions but keep any remaining markers
+    # (e.g. python_version).
+    cleaned = EXTRA_RE.sub("", marker_part)
+    cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
+    cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
+
+    if cleaned:
+        lines.append(f"{req_part.strip()} ; {cleaned}")
+    else:
+        lines.append(req_part.strip())
+
+with open(out_path, "w") as f:
+    for line in lines:
+        f.write(line + "\n")
+
+print(f"Wrote {len(lines)} constraints to {out_path}")
+PYEOF
+
+echo ">>> Installed vllm deps (first 20 lines):"
+head -20 "${WORK_DIR}/vllm-constraints.txt"
+
+# Remove Ray's vllm pin — the installed vllm's transitive deps
+# (written above) replace it in the resolution. vllm itself cannot
+# be resolved from PyPI for in-development versions, so we test
+# whether Ray's requirements can coexist with vllm's dependency
+# constraints instead.
+sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
+
+# Install uv if needed
+if ! command -v uv &>/dev/null; then
+    echo ">>> Installing uv"
+    pip install uv -q
+fi
+
+# Resolve: given vllm's constraints, can Ray compile a lock file?
+#
+# vllm's dependency constraints are the fixed side — Ray is flexible and
+# can regenerate its lock files. We pass vllm's constraints via -c so
+# the resolver treats them as non-negotiable bounds, then check whether
+# Ray's own requirements can still be satisfied within those bounds.
+echo ""
+echo "============================================================"
+echo ">>> Resolving: Can Ray generate compatible lock files?"
+echo "============================================================"
+
+set +e
+uv pip compile \
+    "${WORK_DIR}/requirements.txt" \
+    "${WORK_DIR}/cloud-requirements.txt" \
+    "${WORK_DIR}/base-test-requirements.txt" \
+    "${WORK_DIR}/llm-requirements.txt" \
+    "${WORK_DIR}/llm-test-requirements.txt" \
+    -c "${WORK_DIR}/vllm-constraints.txt" \
+    --python-version 3.12 \
+    --python-platform x86_64-manylinux_2_31 \
+    --extra-index-url https://download.pytorch.org/whl/cu129 \
+    --index-strategy unsafe-best-match \
+    --unsafe-package setuptools \
+    --unsafe-package ray \
+    --no-header \
+    -o "${WORK_DIR}/resolved.txt" \
+    2>&1
+EXIT_CODE=$?
+set -e
+
+echo ""
+echo "=========================================="
+if [ $EXIT_CODE -eq 0 ]; then
+    echo "SUCCESS: Ray can generate lock files compatible with this vllm."
+    echo ""
+    echo "Key resolved versions:"
+    grep -E '^(protobuf|torch|numpy|transformers)==' \
+        "${WORK_DIR}/resolved.txt" | sort || true
+    echo "=========================================="
+    exit 0
+fi
+
+echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
+echo "This means a fundamental dependency conflict exists that Ray"
+echo "cannot resolve by regenerating its lock files."
+echo "See: https://github.com/vllm-project/vllm/issues/33599"
+echo "=========================================="
+
+# Buildkite annotation
+if [ -f /usr/bin/buildkite-agent ]; then
+    buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
+### :warning: Ray Dependency Compatibility Warning
+This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
+Ray would not be able to regenerate its lock files to accommodate this vllm version.
+
+Please check the **Ray Dependency Compatibility Check** step logs for details.
+See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
+EOF
+fi
+
+# Notify Slack if webhook is configured and PR/branch are valid.
+if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
+    PR="${BUILDKITE_PULL_REQUEST:-}"
+    BRANCH="${BUILDKITE_BRANCH:-}"
+
+    # Skip notification if PR is invalid or branch is empty
+    if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
+        echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
+    else
+        echo ">>> Sending Slack notification"
+        # Single quotes are intentional: the f-string expressions are Python, not shell.
+        # shellcheck disable=SC2016
+        PAYLOAD=$(python3 -c '
+import json, os, sys
+pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
+branch = os.getenv("BUILDKITE_BRANCH", "unknown")
+url = os.getenv("BUILDKITE_BUILD_URL", "#")
+data = {
+    "text": ":warning: Ray Dependency Compatibility Check Failed",
+    "blocks": [{
+        "type": "section",
+        "text": {
+            "type": "mrkdwn",
+            "text": (
+                "*:warning: Ray Dependency Compatibility Check Failed*\n"
+                f"PR #{pr} on branch `{branch}` introduces dependencies "
+                f"that cannot be resolved with Ray'\''s requirements.\n"
+                f"<{url}|View Build>"
+            ),
+        },
+    }],
+}
+print(json.dumps(data))
+')
+
+        HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
+            -H 'Content-type: application/json' \
+            -d "$PAYLOAD")
+        echo "    Slack webhook response: $HTTP_CODE"
+    fi
+else
+    echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
+fi
+
+exit 1
--- a/.buildkite/scripts/cherry-pick-from-milestone.sh
+++ b/.buildkite/scripts/cherry-pick-from-milestone.sh
@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."

 # Store PR data in a temp file
 PR_DATA=$(mktemp)
-trap "rm -f $PR_DATA" EXIT
+trap 'rm -f "$PR_DATA"' EXIT

 if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
    --limit 1000 \

--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
--- a/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
+#!/bin/bash
+set -euox pipefail
+
+export VLLM_CPU_KVCACHE_SPACE=1 
+export VLLM_CPU_CI_ENV=1
+# Reduce sub-processes for acceleration
+export TORCH_COMPILE_DISABLE=1 
+export VLLM_ENABLE_V1_MULTIPROCESSING=0
+
+SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
+SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
+wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
+echo "${SDE_CHECKSUM}  ${SDE_ARCHIVE}" | sha256sum --check
+mkdir -p sde
+tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
+
+wait_for_pid_and_check_log() {
+    local pid="$1"
+    local log_file="$2"
+    local exit_status
+
+    if [ -z "$pid" ] || [ -z "$log_file" ]; then
+        echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>"
+        return 1
+    fi
+
+    echo "Waiting for process $pid to finish..."
+    
+    # Use the 'wait' command to pause the script until the specific PID exits.
+    # The 'wait' command's own exit status will be that of the waited-for process.
+    if wait "$pid"; then
+        exit_status=$?
+        echo "Process $pid finished with exit status $exit_status (Success)."
+    else
+        exit_status=$?
+        echo "Process $pid finished with exit status $exit_status (Failure)."
+    fi
+
+    if [ "$exit_status" -ne 0 ]; then
+        echo "Process exited with a non-zero status."
+        echo "--- Last few lines of log file: $log_file ---"
+        tail -n 50 "$log_file"
+        echo "---------------------------------------------"
+        return 1 # Indicate failure based on exit status
+    fi
+
+    echo "No errors detected in log file and process exited successfully."
+    return 0
+}
+
+# Run the same offline-inference example under three emulated CPU targets in
+# parallel; each run writes its output to its own log file.
+
+# Test Skylake (AVX512F)
+./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
+PID_TEST_0=$!
+
+# Test Cascade Lake (AVX512F + VNNI)
+./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
+PID_TEST_1=$!
+
+# Test Cooper Lake (AVX512F + VNNI + BF16)
+./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
+PID_TEST_2=$!
+
+# Collect every result before deciding the outcome. A bare failing call would
+# abort the script immediately under `set -e`, hiding the logs of the
+# remaining targets.
+failed=0
+wait_for_pid_and_check_log "$PID_TEST_0" test_0.log || failed=1
+wait_for_pid_and_check_log "$PID_TEST_1" test_1.log || failed=1
+wait_for_pid_and_check_log "$PID_TEST_2" test_2.log || failed=1
+exit "$failed"
--- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
 #!/bin/bash
 set -euox pipefail
+export VLLM_CPU_CI_ENV=0

 echo "--- PP+TP"
 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
 server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
 vllm bench serve \
    --backend vllm \
    --dataset-name random \
    --model meta-llama/Llama-3.2-3B-Instruct \
    --num-prompts 20 \
+    --result-dir ./test_results \
+    --result-filename tp_pp.json \
+    --save-result \
    --endpoint /v1/completions
-kill -s SIGTERM $server_pid &
+kill -s SIGTERM $server_pid; wait $server_pid || true
+failed_req=$(jq '.failed' ./test_results/tp_pp.json)
+if [ "$failed_req" -ne 0 ]; then
+  echo "Some requests were failed!"
+  exit 1
+fi

 echo "--- DP+TP"
 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
 server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
 vllm bench serve \
    --backend vllm \
    --dataset-name random \
    --model meta-llama/Llama-3.2-3B-Instruct \
    --num-prompts 20 \
+    --result-dir ./test_results \
+    --result-filename dp_pp.json \
+    --save-result \
    --endpoint /v1/completions
-kill -s SIGTERM $server_pid &
+kill -s SIGTERM $server_pid; wait $server_pid || true
+failed_req=$(jq '.failed' ./test_results/dp_pp.json)
+if [ "$failed_req" -ne 0 ]; then
+  echo "Some requests were failed!"
+  exit 1
+fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -34,7 +34,7 @@ function cpu_tests() {
  # offline inference
  docker exec cpu-test bash -c "
    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"

  # Run model tests
  docker exec cpu-test bash -c "

--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -27,7 +27,7 @@ function cpu_tests() {
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -xve
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
@@ -43,7 +43,7 @@ function cpu_tests() {
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
-    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
+    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
 }

 # All of CPU tests are expected to be finished less than 40 mins.

--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
 docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run --rm --cpuset-cpus=$CORE_RANGE --cpuset-mems=$NUMA_NODE -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g $IMAGE_NAME \
-        timeout $TIMEOUT_VAL bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
+docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
+        timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@@ -25,5 +25,5 @@ remove_docker_container

 # Run the image and test offline inference
 docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
+    python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
 '
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
 #!/bin/bash

-# This script build the CPU docker image and run the offline inference inside the container.
+# This script builds the HPU docker image and runs the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
+#
+# vllm-gaudi compatibility pinning:
+#   The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
+#   When upstream vllm changes its API, the plugin may break before it has been updated.
+#   To handle this, the vllm-gaudi repository maintains a file:
+#     vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
+#   The first line of that file controls what version of vllm is used inside the Docker image:
+#     - "latest"        : no checkout override; the current Buildkite CI commit is used as-is.
+#     - "<commit SHA>"  : vllm is checked out to that specific commit before building, pinning
+#                         the test to a known-compatible baseline.
+#   To unpin (resume testing against the live vllm tip), set the file content back to "latest".
 set -exuo pipefail

+# Fetch the vllm community commit reference from vllm-gaudi (first line only).
+# -f makes curl fail on HTTP errors, so an error page (e.g. "404: Not Found")
+# is never mistaken for a commit reference; with `pipefail` set above, that
+# failure aborts the script. -L follows redirects, -S still prints the error.
+VLLM_COMMUNITY_COMMIT=$(curl -fsSL \
+  https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
+  | head -1 | tr -d '\r\n')
+
+# An empty reference would make the image build run `git checkout` with no
+# argument, which leaves the tree untouched — fail loudly instead.
+if [ -z "${VLLM_COMMUNITY_COMMIT}" ]; then
+  echo "ERROR: VLLM_COMMUNITY_COMMIT fetched from vllm-gaudi is empty" >&2
+  exit 1
+fi
+
+echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
+
 # Try building the docker image
 image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
 container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
-cat <<EOF | docker build -t ${image_name} -f - .
+cat <<EOF | docker build -t "${image_name}" -f - .
 FROM gaudi-base-image:latest

 COPY ./ /workspace/vllm

+# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
+# to the version known to be compatible with vllm-gaudi. When the value is "latest",
+# the current checkout (the Buildkite CI commit) is used unchanged.
+RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
+      cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
+    fi
+
 WORKDIR /workspace/vllm

 ENV no_proxy=localhost,127.0.0.1
@@ -39,19 +64,19 @@ EOF
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_containers() { docker rm -f ${container_name} || true; }
+remove_docker_containers() { docker rm -f "${container_name}" || true; }
 trap 'remove_docker_containers; exit $EXITCODE;' EXIT
 remove_docker_containers

 echo "Running HPU plugin v1 test"
-docker run --rm --runtime=habana --name=${container_name} --network=host \
+docker run --rm --runtime=habana --name="${container_name}" --network=host \
  -e HABANA_VISIBLE_DEVICES=all \
  -e VLLM_SKIP_WARMUP=true \
  -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
  -e PT_HPU_LAZY_MODE=1 \
  "${image_name}" \
  /bin/bash -c '
-  cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
 '

 EXITCODE=$?

--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -41,6 +41,7 @@ get_config() {
        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
        exit 1
    fi
+    # shellcheck source=/dev/null
    source "${TEST_RUN_CONFIG_FILE}"
    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
    return 0
@@ -48,9 +49,8 @@ get_config() {

 # get test running configuration.
 fetch_vllm_test_cfg
-get_config
 # Check if the function call was successful. If not, exit the script.
-if [ $? -ne 0 ]; then
+if ! get_config; then
  exit 1
 fi

@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
 echo "agent_idx: ${agent_idx}"
 builder_name="cachebuilder${agent_idx}"
 builder_cache_dir="/mnt/docker-cache${agent_idx}"
-mkdir -p ${builder_cache_dir}
+mkdir -p "${builder_cache_dir}"

 # Try building the docker image
 cat <<EOF | DOCKER_BUILDKIT=1 docker build \
-    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
-    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
-                           --cache-to type=local,dest=${builder_cache_dir},mode=max \
-    --progress=plain --load -t ${image_name} -f - .
+    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
+    --builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
+                           --cache-to type=local,dest="${builder_cache_dir}",mode=max \
+    --progress=plain --load -t "${image_name}" -f - .
 FROM ${BASE_IMAGE_NAME}

 # Define environments
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
-    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/

 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
 # Generate corresponding --device args based on BUILDKITE_AGENT_NAME
 # Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
 #   e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
-#   returns --device /dev/davinci0 --device /dev/davinci1
+#   returns one argument per line: --device, /dev/davinciX, ...
 parse_and_gen_devices() {
    local input="$1"
    local index cards_num
@@ -151,29 +151,24 @@ parse_and_gen_devices() {
        return 1
    fi

-    local devices=""
    local i=0
    while (( i < cards_num )); do
        local dev_idx=$(((index - 1)*cards_num + i ))
-        devices="$devices --device /dev/davinci${dev_idx}"
+        printf '%s\n' "--device"
+        printf '%s\n' "/dev/davinci${dev_idx}"
        ((i++))
    done
-
-    # trim leading space
-    devices="${devices#"${devices%%[![:space:]]*}"}"
-    # Output devices: assigned to the caller variable
-    printf '%s' "$devices"
 }

-devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+# Collect the generator's output with command substitution first. The exit
+# status of a process substitution (< <(...)) is not propagated to the
+# calling command, so "mapfile ... < <(...) || exit 1" would only test
+# mapfile itself and a parse failure would silently yield an empty device
+# list. Command substitution exposes the function's status to "||".
+device_lines=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+
+# Split the newline-separated arguments into an array. The emptiness guard
+# avoids a here-string turning empty output into a single "" element.
+device_args=()
+if [[ -n "${device_lines}" ]]; then
+    mapfile -t device_args <<< "${device_lines}"
+fi

 # Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
 # This test checks whether the OOT platform interface is functioning properly in conjunction with
 # the hardware plugin vllm-ascend.
 model_cache_dir=/mnt/modelscope${agent_idx}
-mkdir -p ${model_cache_dir}
+mkdir -p "${model_cache_dir}"
 docker run \
-    ${devices} \
+    "${device_args[@]}" \
    --device /dev/davinci_manager \
    --device /dev/devmm_svm \
    --device /dev/hisi_hdc \
@@ -182,7 +177,7 @@ docker run \
    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
    -v /etc/ascend_install.info:/etc/ascend_install.info \
-    -v ${model_cache_dir}:/root/.cache/modelscope \
+    -v "${model_cache_dir}":/root/.cache/modelscope \
    --entrypoint="" \
    --name "${container_name}" \
    "${image_name}" \

--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"


--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"


--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

 # Try building the docker image
-docker build -t ${image_name} -f docker/Dockerfile.xpu .
+docker build -t "${image_name}" -f docker/Dockerfile.xpu .

 # Setup cleanup
 remove_docker_container() {
@@ -34,17 +34,17 @@ docker run \
    set -e
    echo $ZE_AFFINITY_MASK
    pip install tblib==3.1.0
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
-    python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
+    python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
+    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
+    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
    cd tests
-    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
+    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py