Merge tag 'v0.18.0' into v0.18.0-ori

3fb4b5fa · zhuwenwen · bcf25339 · 89138b21 · 3fb4b5fa · 3fb4b5fa
Commit 3fb4b5fa authored Mar 23, 2026 by zhuwenwen
20 changed files
--- a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json
@@ -10,7 +10,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -37,7 +36,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -64,7 +62,6 @@
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy",
            "max-model-len": 2048,
@@ -78,5 +75,83 @@
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
+    },
+    {
+        "test_name": "serving_deepseek_r1",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "tensor_parallel_size": 8,
+            "disable_log_stats": "",
+            "load_format": "dummy",
+            "max-model-len": 2048,
+            "max-num-seqs": 200,
+            "async-scheduling": "",
+            "dtype": "bfloat16"
+        },
+        "client_parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama4_maverick_17b128e_instruct_fp8",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "tensor_parallel_size": 8,
+            "disable_log_stats": "",
+            "max-model-len": 2048,
+            "max-num-seqs": 128,
+            "async-scheduling": "",
+            "enable_expert_parallel": "",
+            "max-num-batched-tokens": 4096
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_qwen3_8b",
+        "qps_list": [1, 4, 10, "inf"],
+        "server_environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "server_parameters": {
+            "model": "Qwen/Qwen-3-8B",
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "disable_log_stats": "",
+            "async-scheduling": ""
+        },
+        "client_parameters": {
+            "model": "Qwen/Qwen-3-8B",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
    }
 ]
--- a/.buildkite/performance-benchmarks/tests/serving-tests.json
+++ b/.buildkite/performance-benchmarks/tests/serving-tests.json
@@ -5,7 +5,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -23,7 +22,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -41,7 +39,6 @@
        "server_parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
-            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
@@ -59,7 +56,6 @@
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", 
            "tensor_parallel_size": 4,
-            "swap_space": 16,
            "speculative_config": {
                "model": "turboderp/Qwama-0.5B-Instruct",
                "num_speculative_tokens": 4,

--- a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
+++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json
@@ -57,5 +57,67 @@
            "max-num-seqs": 512,
            "async-scheduling": ""
        }
+    },
+    {
+        "test_name": "throughput_deepseek_r1",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "deepseek-ai/DeepSeek-R1",
+            "tensor_parallel_size": 8,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_name": "sharegpt",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 384,
+            "async-scheduling": ""
+        }
+    },
+    {
+        "test_name": "throughput_llama4_maverick_17b128e_instruct_fp8",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+            "tensor_parallel_size": 8,
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_name": "sharegpt",
+            "num_prompts": 1000,
+            "backend": "vllm",
+            "max-model-len": 2048,
+            "max-num-seqs": 512,
+            "async-scheduling": "",
+            "enable_expert_parallel": ""
+        }
+    },
+    {
+        "test_name": "throughput_qwen3_8b",
+        "environment_variables": {
+            "PT_HPU_LAZY_MODE": 1,
+            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+            "VLLM_CONTIGUOUS_PA": 1,
+            "VLLM_DEFRAG": 1
+        },
+        "parameters": {
+            "model": "Qwen/Qwen-3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "dataset_name": "sharegpt",
+            "num_prompts": 1000,
+            "max-num-seqs": 512,
+            "backend": "vllm",
+            "async-scheduling": ""
+        }
    }
 ]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -83,7 +83,7 @@ steps:
        agents:
          queue: cpu_queue_postmerge
        commands:
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
          - "mkdir artifacts"
          - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
          - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
@@ -152,7 +152,7 @@ steps:
          queue: cpu_queue_postmerge
        commands:
          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+          - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
          - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
        env:

--- a/.buildkite/scripts/annotate-rocm-release.sh
+++ b/.buildkite/scripts/annotate-rocm-release.sh
@@ -25,7 +25,7 @@ S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
 S3_URL="http://${S3_BUCKET}.s3-website-${S3_REGION}.amazonaws.com"

 # Format ROCm version for path by dropping the dots (e.g., "7.1.0" -> "rocm710")
-ROCM_VERSION_PATH="rocm$(echo ${ROCM_VERSION} | tr -d '.')"
+ROCM_VERSION_PATH="rocm$(echo "${ROCM_VERSION}" | tr -d '.')"
 ROCM_PATH="rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}"
 buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
 ## ROCm Wheel and Docker Image Releases
@@ -68,7 +68,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/triton
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchvision-*.whl .
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/torchaudio-*.whl .
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amdsmi-*.whl .
-aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/aiter-*.whl .
+aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/amd_aiter-*.whl .
 aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-attn-*.whl .
 \`\`\`

@@ -80,7 +80,7 @@ aws s3 cp s3://${S3_BUCKET}/rocm/${BUILDKITE_COMMIT}/${ROCM_VERSION_PATH}/flash-
 - **torchvision**: TorchVision for ROCm PyTorch
 - **torchaudio**: Torchaudio for ROCm PyTorch
 - **amdsmi**: AMD SMI Python bindings
-- **aiter**: Aiter for ROCm
+- **amd_aiter**: Aiter for ROCm
 - **flash-attn**: Flash Attention for ROCm

 ### :warning: Notes

--- a/.buildkite/scripts/cache-rocm-base-wheels.sh
+++ b/.buildkite/scripts/cache-rocm-base-wheels.sh
@@ -83,7 +83,7 @@ case "${1:-}" in
            exit 1
        fi

-        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
        if [[ "$WHEEL_COUNT" -eq 0 ]]; then
            echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
            exit 1
@@ -110,9 +110,9 @@ case "${1:-}" in

        echo ""
        echo "Downloaded wheels:"
-        ls -lh artifacts/rocm-base-wheels/
+        find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;

-        WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
+        WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
        echo ""
        echo "Total: $WHEEL_COUNT wheels"
        echo "========================================"

--- a/.buildkite/scripts/check-ray-compatibility.sh
+++ b/.buildkite/scripts/check-ray-compatibility.sh
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Check if Ray LLM can generate lock files that are compatible with this
+# version of vllm. Downloads Ray's requirement files and runs a full
+# dependency resolution with the installed vllm's constraints to see if
+# a valid lock file can be produced.
+#
+# See: https://github.com/vllm-project/vllm/issues/33599
+
+set -eo pipefail
+
+RAY_BASE_URL="https://raw.githubusercontent.com/ray-project/ray/master/python"
+
+WORK_DIR=$(mktemp -d)
+trap 'rm -rf "$WORK_DIR"' EXIT
+
+# Fetch all Ray requirement files used in the LLM depset pipeline
+echo ">>> Fetching Ray requirement files"
+RAY_FILES=(
+    "requirements.txt"
+    "requirements/cloud-requirements.txt"
+    "requirements/base-test-requirements.txt"
+    "requirements/llm/llm-requirements.txt"
+    "requirements/llm/llm-test-requirements.txt"
+)
+for FILE in "${RAY_FILES[@]}"; do
+    LOCAL_PATH="${WORK_DIR}/$(basename "$FILE")"
+    echo "    ${FILE}"
+    curl -fsSL -o "$LOCAL_PATH" "${RAY_BASE_URL}/${FILE}"
+done
+
+# Extract installed vllm deps
+echo ">>> Extracting installed vllm dependency constraints"
+python3 - "${WORK_DIR}/vllm-constraints.txt" <<'PYEOF'
+"""Write out the installed vllm's dependencies as pip constraint lines.
+
+Ray uses vllm[audio], so audio-extra deps are included with their extra
+markers stripped. The resolver cannot evaluate extra markers for a
+package that is not itself being resolved from an index, so we activate
+them manually here.
+"""
+import importlib.metadata
+import re
+import sys
+
+out_path = sys.argv[1]
+raw_reqs = importlib.metadata.requires("vllm") or []
+
+# Ray uses vllm[audio] – activate that extra.
+ACTIVE_EXTRAS = {"audio"}
+EXTRA_RE = re.compile(r"""extra\s*==\s*['"]([^'"]+)['"]""")
+
+lines = []
+for r in raw_reqs:
+    if ";" not in r:
+        # Unconditional dep — always include.
+        lines.append(r.strip())
+        continue
+
+    req_part, _, marker_part = r.partition(";")
+    marker_part = marker_part.strip()
+
+    extra_matches = EXTRA_RE.findall(marker_part)
+    if not extra_matches:
+        # Non-extra marker (python_version, etc.) — keep as-is.
+        lines.append(r.strip())
+        continue
+
+    if not ACTIVE_EXTRAS.intersection(extra_matches):
+        continue  # Skip inactive extras (tensorizer, bench, …).
+
+    # Strip the extra== conditions but keep any remaining markers
+    # (e.g. python_version).
+    cleaned = EXTRA_RE.sub("", marker_part)
+    cleaned = re.sub(r"\band\b\s*\band\b", "and", cleaned)
+    cleaned = re.sub(r"^\s*and\s+|\s+and\s*$", "", cleaned).strip()
+
+    if cleaned:
+        lines.append(f"{req_part.strip()} ; {cleaned}")
+    else:
+        lines.append(req_part.strip())
+
+with open(out_path, "w") as f:
+    for line in lines:
+        f.write(line + "\n")
+
+print(f"Wrote {len(lines)} constraints to {out_path}")
+PYEOF
+
+echo ">>> Installed vllm deps (first 20 lines):"
+head -20 "${WORK_DIR}/vllm-constraints.txt"
+
+# Remove Ray's vllm pin — the installed vllm's transitive deps
+# (written above) replace it in the resolution. vllm itself cannot
+# be resolved from PyPI for in-development versions, so we test
+# whether Ray's requirements can coexist with vllm's dependency
+# constraints instead.
+sed -i '/^vllm/d' "${WORK_DIR}/llm-requirements.txt"
+
+# Install uv if needed
+if ! command -v uv &>/dev/null; then
+    echo ">>> Installing uv"
+    pip install uv -q
+fi
+
+# Resolve: given vllm's constraints, can Ray compile a lock file?
+#
+# vllm's dependency constraints are the fixed side — Ray is flexible and
+# can regenerate its lock files. We pass vllm's constraints via -c so
+# the resolver treats them as non-negotiable bounds, then check whether
+# Ray's own requirements can still be satisfied within those bounds.
+echo ""
+echo "============================================================"
+echo ">>> Resolving: Can Ray generate compatible lock files?"
+echo "============================================================"
+
+set +e
+uv pip compile \
+    "${WORK_DIR}/requirements.txt" \
+    "${WORK_DIR}/cloud-requirements.txt" \
+    "${WORK_DIR}/base-test-requirements.txt" \
+    "${WORK_DIR}/llm-requirements.txt" \
+    "${WORK_DIR}/llm-test-requirements.txt" \
+    -c "${WORK_DIR}/vllm-constraints.txt" \
+    --python-version 3.12 \
+    --python-platform x86_64-manylinux_2_31 \
+    --extra-index-url https://download.pytorch.org/whl/cu129 \
+    --index-strategy unsafe-best-match \
+    --unsafe-package setuptools \
+    --unsafe-package ray \
+    --no-header \
+    -o "${WORK_DIR}/resolved.txt" \
+    2>&1
+EXIT_CODE=$?
+set -e
+
+echo ""
+echo "=========================================="
+if [ $EXIT_CODE -eq 0 ]; then
+    echo "SUCCESS: Ray can generate lock files compatible with this vllm."
+    echo ""
+    echo "Key resolved versions:"
+    grep -E '^(protobuf|torch|numpy|transformers)==' \
+        "${WORK_DIR}/resolved.txt" | sort || true
+    echo "=========================================="
+    exit 0
+fi
+
+echo "FAILURE: Ray cannot generate lock files compatible with this vllm."
+echo "This means a fundamental dependency conflict exists that Ray"
+echo "cannot resolve by regenerating its lock files."
+echo "See: https://github.com/vllm-project/vllm/issues/33599"
+echo "=========================================="
+
+# Buildkite annotation
+if [ -f /usr/bin/buildkite-agent ]; then
+    buildkite-agent annotate --style 'warning' --context 'ray-compat' << EOF
+### :warning: Ray Dependency Compatibility Warning
+This PR introduces dependencies that **cannot** be resolved with Ray's requirements.
+Ray would not be able to regenerate its lock files to accommodate this vllm version.
+
+Please check the **Ray Dependency Compatibility Check** step logs for details.
+See [issue #33599](https://github.com/vllm-project/vllm/issues/33599) for context.
+EOF
+fi
+
+# Notify Slack if webhook is configured and PR/branch are valid.
+if [ -n "$RAY_COMPAT_SLACK_WEBHOOK_URL" ]; then
+    PR="${BUILDKITE_PULL_REQUEST:-}"
+    BRANCH="${BUILDKITE_BRANCH:-}"
+
+    # Skip notification if PR is invalid or branch is empty
+    if [[ "$PR" = "false" || -z "$PR" || -z "$BRANCH" ]]; then
+        echo ">>> Skipping Slack notification (invalid PR or empty branch: PR=$PR, branch=$BRANCH)"
+    else
+        echo ">>> Sending Slack notification"
+        # Single quotes are intentional: the f-string expressions are Python, not shell.
+        # shellcheck disable=SC2016
+        PAYLOAD=$(python3 -c '
+import json, os, sys
+pr = os.getenv("BUILDKITE_PULL_REQUEST", "N/A")
+branch = os.getenv("BUILDKITE_BRANCH", "unknown")
+url = os.getenv("BUILDKITE_BUILD_URL", "#")
+data = {
+    "text": ":warning: Ray Dependency Compatibility Check Failed",
+    "blocks": [{
+        "type": "section",
+        "text": {
+            "type": "mrkdwn",
+            "text": (
+                "*:warning: Ray Dependency Compatibility Check Failed*\n"
+                f"PR #{pr} on branch `{branch}` introduces dependencies "
+                f"that cannot be resolved with Ray'\''s requirements.\n"
+                f"<{url}|View Build>"
+            ),
+        },
+    }],
+}
+print(json.dumps(data))
+')
+
+        HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RAY_COMPAT_SLACK_WEBHOOK_URL" \
+            -H 'Content-type: application/json' \
+            -d "$PAYLOAD")
+        echo "    Slack webhook response: $HTTP_CODE"
+    fi
+else
+    echo ">>> Skipping Slack notification (RAY_COMPAT_SLACK_WEBHOOK_URL not set)"
+fi
+
+exit 1
--- a/.buildkite/scripts/cherry-pick-from-milestone.sh
+++ b/.buildkite/scripts/cherry-pick-from-milestone.sh
@@ -134,7 +134,7 @@ log_info "Fetching merged PRs from milestone '${MILESTONE}'..."

 # Store PR data in a temp file
 PR_DATA=$(mktemp)
-trap "rm -f $PR_DATA" EXIT
+trap 'rm -f "$PR_DATA"' EXIT

 if ! gh pr list --state merged --search "milestone:${MILESTONE}" \
    --limit 1000 \

--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
 #!/bin/bash

-# This script runs test inside the corresponding ROCm docker container.
+# This script runs tests inside the corresponding ROCm docker container.
+# It handles both single-node and multi-node test configurations.
+#
+# Multi-node detection: Instead of matching on fragile group names, we detect
+# multi-node jobs structurally by looking for the bracket command syntax
+# "[node0_cmds] && [node1_cmds]" or via the NUM_NODES environment variable.
+#
+###############################################################################
+# QUOTING / COMMAND PASSING
+#
+# Passing commands as positional arguments ($*) is fragile when the command
+# string itself contains double quotes, e.g.:
+#
+#   bash run-amd-test.sh "export FLAGS="value" && pytest -m "not slow""
+#
+# The outer shell resolves the nested quotes *before* this script runs, so
+# the script receives mangled input it cannot fully recover.
+#
+# Preferred: pass commands via the VLLM_TEST_COMMANDS environment variable:
+#
+#   export VLLM_TEST_COMMANDS='export FLAGS="value" && pytest -m "not slow"'
+#   bash run-amd-test.sh
+#
+# Single-quoted assignment preserves all inner double quotes verbatim.
+# The $* path is kept for backward compatibility but callers should migrate.
+###############################################################################
 set -o pipefail

 # Export Python path
 export PYTHONPATH=".."

-# Print ROCm version
-echo "--- Confirming Clean Initial State"
-while true; do
-        sleep 3
-        if grep -q clean /opt/amdgpu/etc/gpu_state; then
-                echo "GPUs state is \"clean\""
-                break
-        fi
-done
-
-echo "--- ROCm info"
-rocminfo
+###############################################################################
+# Helper Functions
+###############################################################################
+
+wait_for_clean_gpus() {
+  local timeout=${1:-300}
+  local start=$SECONDS
+  echo "--- Waiting for clean GPU state (timeout: ${timeout}s)"
+  while true; do
+    if grep -q clean /opt/amdgpu/etc/gpu_state; then
+      echo "GPUs state is \"clean\""
+      return
+    fi
+    if (( SECONDS - start >= timeout )); then
+      echo "Error: GPUs did not reach clean state within ${timeout}s" >&2
+      exit 1
+    fi
+    sleep 3
+  done
+}

-# cleanup older docker images
 cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
@@ -28,15 +60,12 @@ cleanup_docker() {
    exit 1
  fi
  echo "Docker root directory: $docker_root"
-  # Check disk usage of the filesystem where Docker's root directory is located
+
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
-  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
-    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
-    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
@@ -45,193 +74,445 @@ cleanup_docker() {
 }

 cleanup_network() {
-  for node in $(seq 0 $((NUM_NODES-1))); do
-    if docker pr -a -q -f name="node${node}" | grep -q .; then
-      docker stop "node${node}"
+  local max_nodes=${NUM_NODES:-2}
+  for node in $(seq 0 $((max_nodes - 1))); do
+    if docker ps -a -q -f name="node${node}" | grep -q .; then
+      docker stop "node${node}" || true
    fi
  done
-  if docker network ls | grep docker-net; then
-    docker network rm docker-net
+  if docker network ls | grep -q docker-net; then
+    docker network rm docker-net || true
+  fi
+}
+
+is_multi_node() {
+  local cmds="$1"
+  # Primary signal: NUM_NODES environment variable set by the pipeline
+  if [[ "${NUM_NODES:-1}" -gt 1 ]]; then
+    return 0
+  fi
+  # Fallback: detect the bracket syntax structurally
+  # Pattern: [...] && [...] (per-node command arrays)
+  if [[ "$cmds" =~ \[.*\].*\&\&.*\[.*\] ]]; then
+    return 0
+  fi
+  return 1
+}
+
+handle_pytest_exit() {
+  local exit_code=$1
+  if [ "$exit_code" -eq 5 ]; then
+    echo "Pytest exit code 5 (no tests collected) - treating as success."
+    exit 0
  fi
+  exit "$exit_code"
 }

-# Call the cleanup docker function
+###############################################################################
+# Pytest marker/keyword re-quoting
+#
+# When commands are passed through Buildkite -> shell -> $* -> bash -c,
+# quotes around multi-word pytest -m/-k expressions get stripped:
+#   pytest -v -s -m 'not cpu_test' v1/core
+# becomes:
+#   pytest -v -s -m not cpu_test v1/core
+#
+# pytest then interprets "cpu_test" as a file path, not part of the marker.
+#
+# This function detects unquoted expressions after -m/-k and re-quotes them
+# by collecting tokens until a recognizable boundary is reached:
+#   - test path (contains '/')
+#   - test file (ends with '.py')
+#   - another pytest flag (--xxx or -x single-char flags)
+#   - command separator (&& || ; |)
+#   - environment variable assignment (FOO=bar)
+#
+# Single-word markers (e.g. -m cpu_test, -m hybrid_model) pass through
+# unquoted since they have no spaces and work fine.
+#
+# Already-quoted expressions (containing literal single quotes) are passed
+# through untouched to avoid double-quoting values injected by
+# apply_rocm_test_overrides.
+#
+# NOTE: This ONLY fixes -m/-k flags. It cannot recover arbitrary inner
+# double-quotes stripped by the calling shell (see header comment).
+# Use VLLM_TEST_COMMANDS to avoid the problem entirely.
+###############################################################################
+re_quote_pytest_markers() {
+  local input="$1"
+  local output=""
+  local collecting=false
+  local marker_buf=""
+
+  # Strip backslash-newline continuations, then flatten remaining newlines
+  local flat="${input//$'\\\n'/ }"
+  flat="${flat//$'\n'/ }"
+
+  # Disable globbing to prevent *.py etc. from expanding during read -ra
+  local restore_glob
+  restore_glob="$(shopt -p -o noglob 2>/dev/null || true)"
+  set -o noglob
+  local -a words
+  read -ra words <<< "$flat"
+  eval "$restore_glob"
+
+  for word in "${words[@]}"; do
+    if $collecting; then
+      # If the token we're about to collect already contains a literal
+      # single quote, the expression was already quoted upstream.
+      # Flush and stop collecting.
+      if [[ "$word" == *"'"* ]]; then
+        if [[ -n "$marker_buf" ]]; then
+          # Should not normally happen (partial buf + quote), flush raw
+          output+="${marker_buf} "
+          marker_buf=""
+        fi
+        output+="${word} "
+        collecting=false
+        continue
+      fi
+
+      local is_boundary=false
+      case "$word" in
+        # Line-continuation artifact
+        "\\")
+          is_boundary=true ;;
+        # Command separators
+        "&&"|"||"|";"|"|")
+          is_boundary=true ;;
+        # Long flags (--ignore, --shard-id, etc.)
+        --*)
+          is_boundary=true ;;
+        # Short single-letter flags (-v, -s, -x, etc.). Marker keywords such
+        # as "not" never start with "-", so they are not matched here. -k/-m
+        # also match; the flush logic below restarts collection for them.
+        -[a-zA-Z])
+          is_boundary=true ;;
+        # Test path (contains /)
+        */*)
+          is_boundary=true ;;
+        # Test file (ends with .py, possibly with ::method)
+        *.py|*.py::*)
+          is_boundary=true ;;
+        # Environment variable assignment preceding a command (FOO=bar)
+        *=*)
+          # Only treat as boundary if it looks like VAR=value, not
+          # pytest filter expressions like num_gpus=2 inside markers
+          if [[ "$word" =~ ^[A-Z_][A-Z0-9_]*= ]]; then
+            is_boundary=true
+          fi
+          ;;
+      esac
+
+      if $is_boundary; then
+        # Strip surrounding double quotes if present (from upstream
+        # single-to-double conversion); without this, wrapping below
+        # would produce '"expr"' with literal double-quote characters.
+        if [[ "$marker_buf" == '"'*'"' ]]; then
+          marker_buf="${marker_buf#\"}"
+          marker_buf="${marker_buf%\"}"
+        fi
+        # Flush the collected marker expression
+        if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
+          output+="'${marker_buf}' "
+        else
+          output+="${marker_buf} "
+        fi
+        collecting=false
+        marker_buf=""
+        # Check if this boundary word itself starts a new -m/-k
+        if [[ "$word" == "-m" || "$word" == "-k" ]]; then
+          output+="${word} "
+          collecting=true
+        # Drop stray backslash tokens silently
+        elif [[ "$word" == "\\" ]]; then
+          :
+        else
+          output+="${word} "
+        fi
+      else
+        # Accumulate into marker buffer
+        if [[ -n "$marker_buf" ]]; then
+          marker_buf+=" ${word}"
+        else
+          marker_buf="${word}"
+        fi
+      fi
+    elif [[ "$word" == "-m" || "$word" == "-k" ]]; then
+      output+="${word} "
+      collecting=true
+      marker_buf=""
+    else
+      output+="${word} "
+    fi
+  done
+
+  # Flush any trailing marker expression (marker at end of command)
+  if $collecting && [[ -n "$marker_buf" ]]; then
+    # Strip surrounding double quotes (see mid-stream flush comment)
+    if [[ "$marker_buf" == '"'*'"' ]]; then
+      marker_buf="${marker_buf#\"}"
+      marker_buf="${marker_buf%\"}"
+    fi
+    if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
+      output+="'${marker_buf}'"
+    else
+      output+="${marker_buf}"
+    fi
+  fi
+
+  echo "${output% }"
+}
+
+###############################################################################
+# ROCm-specific pytest command rewrites
+#
+# These apply ignore flags and environment overrides for tests that are not
+# yet supported or behave differently on ROCm hardware. Kept as a single
+# function so new exclusions are easy to add in one place.
+###############################################################################
+
+apply_rocm_test_overrides() {
+  # Rewrite a test command string with ROCm-specific exclusions and
+  # environment overrides, then print the rewritten string on stdout.
+  #
+  # Arguments:
+  #   $1 - the full command string to rewrite.
+  # Outputs:
+  #   The rewritten command string, written to stdout (the caller captures
+  #   it with command substitution).
+  local cmds="$1"
+
+  # --- Model registry filter ---
+  # Adds a -k expression that deselects the four listed model classes.
+  if [[ $cmds == *"pytest -v -s models/test_registry.py"* ]]; then
+    cmds=${cmds//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
+  fi
+
+  # --- LoRA: disable custom paged attention ---
+  # Prefixes every matching pytest invocation with the environment override.
+  if [[ $cmds == *"pytest -v -s lora"* ]]; then
+    cmds=${cmds//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
+  fi
+
+  # --- Kernel ignores ---
+  # Each kernel block below appends its --ignore flags to the END of the
+  # whole command string, not next to the matching path, so the flags attach
+  # to whichever command comes last in the string.
+  if [[ $cmds == *" kernels/core"* ]]; then
+    cmds="${cmds} \
+    --ignore=kernels/core/test_fused_quant_layernorm.py \
+    --ignore=kernels/core/test_permute_cols.py"
+  fi
+
+  if [[ $cmds == *" kernels/attention"* ]]; then
+    cmds="${cmds} \
+    --ignore=kernels/attention/test_attention_selector.py \
+    --ignore=kernels/attention/test_encoder_decoder_attn.py \
+    --ignore=kernels/attention/test_flash_attn.py \
+    --ignore=kernels/attention/test_flashinfer.py \
+    --ignore=kernels/attention/test_prefix_prefill.py \
+    --ignore=kernels/attention/test_cascade_flash_attn.py \
+    --ignore=kernels/attention/test_mha_attn.py \
+    --ignore=kernels/attention/test_lightning_attn.py \
+    --ignore=kernels/attention/test_attention.py"
+  fi
+
+  if [[ $cmds == *" kernels/quantization"* ]]; then
+    cmds="${cmds} \
+    --ignore=kernels/quantization/test_int8_quant.py \
+    --ignore=kernels/quantization/test_machete_mm.py \
+    --ignore=kernels/quantization/test_block_fp8.py \
+    --ignore=kernels/quantization/test_block_int8.py \
+    --ignore=kernels/quantization/test_marlin_gemm.py \
+    --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
+    --ignore=kernels/quantization/test_int8_kernel.py"
+  fi
+
+  if [[ $cmds == *" kernels/mamba"* ]]; then
+    cmds="${cmds} \
+    --ignore=kernels/mamba/test_mamba_mixer2.py \
+    --ignore=kernels/mamba/test_causal_conv1d.py \
+    --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
+  fi
+
+  if [[ $cmds == *" kernels/moe"* ]]; then
+    cmds="${cmds} \
+    --ignore=kernels/moe/test_moe.py \
+    --ignore=kernels/moe/test_cutlass_moe.py \
+    --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+  fi
+
+  # --- Entrypoint ignores ---
+  # Unlike the kernel blocks, these insert the flags immediately after the
+  # matching path. The pattern requires a space on both sides of the path,
+  # so a path that is the very last token of the string is not rewritten.
+  if [[ $cmds == *" entrypoints/openai "* ]]; then
+    cmds=${cmds//" entrypoints/openai "/" entrypoints/openai \
+    --ignore=entrypoints/openai/test_audio.py \
+    --ignore=entrypoints/openai/test_shutdown.py \
+    --ignore=entrypoints/openai/test_completion.py \
+    --ignore=entrypoints/openai/test_models.py \
+    --ignore=entrypoints/openai/test_lora_adapters.py \
+    --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
+    --ignore=entrypoints/openai/test_root_path.py \
+    --ignore=entrypoints/openai/test_tokenization.py \
+    --ignore=entrypoints/openai/test_prompt_validation.py "}
+  fi
+
+  if [[ $cmds == *" entrypoints/llm "* ]]; then
+    cmds=${cmds//" entrypoints/llm "/" entrypoints/llm \
+    --ignore=entrypoints/llm/test_chat.py \
+    --ignore=entrypoints/llm/test_accuracy.py \
+    --ignore=entrypoints/llm/test_init.py \
+    --ignore=entrypoints/llm/test_prompt_validation.py "}
+  fi
+
+  # Clean up escaped newlines from --ignore appends
+  # The sed expression replaces each " \ " (space, backslash, space) with a
+  # single space.
+  # NOTE(review): the backslash-newline pairs above sit inside double quotes,
+  # where bash removes them as line continuations, so this substitution
+  # presumably has nothing left to match - confirm before relying on it.
+  cmds=$(echo "$cmds" | sed 's/ \\ / /g')
+
+  echo "$cmds"
+}
+
+###############################################################################
+# Main
+###############################################################################
+
+# --- GPU initialization ---
+echo "--- Confirming Clean Initial State"
+wait_for_clean_gpus
+
+echo "--- ROCm info"
+rocminfo
+
+# --- Docker housekeeping ---
 cleanup_docker

 echo "--- Resetting GPUs"
-
 echo "reset" > /opt/amdgpu/etc/gpu_state
+wait_for_clean_gpus

-while true; do
-        sleep 3
-        if grep -q clean /opt/amdgpu/etc/gpu_state; then
-                echo "GPUs state is \"clean\""
-                break
-        fi
-done
-
+# --- Pull test image ---
 echo "--- Pulling container"
 image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"

 remove_docker_container() {
-   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
+  docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
 }
 trap remove_docker_container EXIT

+# --- Prepare commands ---
 echo "--- Running container"

 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"

-commands=$@
-echo "Raw commands: $commands"
-
-commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
-
-if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
-  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
-fi
-
-commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
-
-if [[ $commands == *"pytest -v -s lora"* ]]; then
-  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
-fi
-
-#ignore certain kernels tests
-if [[ $commands == *" kernels/core"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/core/test_fused_quant_layernorm.py \
-  --ignore=kernels/core/test_permute_cols.py"
-fi
-
-if [[ $commands == *" kernels/attention"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/attention/test_attention_selector.py \
-  --ignore=kernels/attention/test_encoder_decoder_attn.py \
-  --ignore=kernels/attention/test_flash_attn.py \
-  --ignore=kernels/attention/test_flashinfer.py \
-  --ignore=kernels/attention/test_prefix_prefill.py \
-  --ignore=kernels/attention/test_cascade_flash_attn.py \
-  --ignore=kernels/attention/test_mha_attn.py \
-  --ignore=kernels/attention/test_lightning_attn.py \
-  --ignore=kernels/attention/test_attention.py"
-fi
-
-if [[ $commands == *" kernels/quantization"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/quantization/test_int8_quant.py \
-  --ignore=kernels/quantization/test_machete_mm.py \
-  --ignore=kernels/quantization/test_block_fp8.py \
-  --ignore=kernels/quantization/test_block_int8.py \
-  --ignore=kernels/quantization/test_marlin_gemm.py \
-  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
-  --ignore=kernels/quantization/test_int8_kernel.py"
-fi
-
-if [[ $commands == *" kernels/mamba"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/mamba/test_mamba_mixer2.py \
-  --ignore=kernels/mamba/test_causal_conv1d.py \
-  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
-fi
-
-if [[ $commands == *" kernels/moe"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/moe/test_moe.py \
-  --ignore=kernels/moe/test_cutlass_moe.py \
-  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+# ---- Command source selection ----
+# Prefer VLLM_TEST_COMMANDS (preserves all inner quoting intact).
+# Fall back to $* for backward compatibility, but warn that inner
+# double-quotes will have been stripped by the calling shell.
+if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then
+  commands="${VLLM_TEST_COMMANDS}"
+  echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)"
+else
+  commands="$*"
+  if [[ -z "$commands" ]]; then
+    echo "Error: No test commands provided." >&2
+    echo "Usage:" >&2
+    echo "  Preferred:  VLLM_TEST_COMMANDS='...' bash $0" >&2
+    echo "  Legacy:     bash $0 \"commands here\"" >&2
+    exit 1
+  fi
+  echo "Commands sourced from positional args (legacy mode)"
+  echo "WARNING: Inner double-quotes in the command string may have been"
+  echo "  stripped by the calling shell. If you see syntax errors, switch to:"
+  echo "  export VLLM_TEST_COMMANDS='your commands here'"
+  echo "  bash $0"
 fi

-#ignore certain Entrypoints/openai tests
-if [[ $commands == *" entrypoints/openai "* ]]; then
-  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
-  --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_shutdown.py \
-  --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_models.py \
-  --ignore=entrypoints/openai/test_lora_adapters.py \
-  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
-  --ignore=entrypoints/openai/test_root_path.py \
-  --ignore=entrypoints/openai/test_tokenization.py \
-  --ignore=entrypoints/openai/test_prompt_validation.py "}
-fi
+echo "Raw commands: $commands"

-#ignore certain Entrypoints/llm tests
-if [[ $commands == *" entrypoints/llm "* ]]; then
-  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
-  --ignore=entrypoints/llm/test_chat.py \
-  --ignore=entrypoints/llm/test_accuracy.py \
-  --ignore=entrypoints/llm/test_init.py \
-  --ignore=entrypoints/llm/test_prompt_validation.py "}
-fi
+# Fix quoting before ROCm overrides (so overrides see correct structure)
+commands=$(re_quote_pytest_markers "$commands")
+echo "After re-quoting: $commands"

-commands=$(echo "$commands" | sed 's/ \\ / /g')
+commands=$(apply_rocm_test_overrides "$commands")
 echo "Final commands: $commands"

-# --ignore=entrypoints/openai/test_encoder_decoder.py \
-# --ignore=entrypoints/openai/test_embedding.py \
-# --ignore=entrypoints/openai/test_oot_registration.py
-# --ignore=entrypoints/openai/test_accuracy.py \
-# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
-
-
 MYPYTHONPATH=".."

-# Test that we're launching on the machine that has
-# proper access to GPUs
+# Verify GPU access
 render_gid=$(getent group render | cut -d: -f3)
 if [[ -z "$render_gid" ]]; then
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
 fi

-if [[ $commands == *"VLLM_TEST_GROUP_NAME=mi325_4-2-node-tests-4-gpus-in-total"* ]]; then
+# --- RDMA device passthrough (conditional) ---
+# If the host has RDMA devices, pass them through so tests like
+# test_moriio_connector can access ibverbs. On hosts without RDMA
+# hardware the tests will gracefully skip via _rdma_available().
+RDMA_FLAGS=""
+if [ -d /dev/infiniband ]; then
+  echo "RDMA devices detected on host, enabling passthrough"
+  RDMA_FLAGS="--device /dev/infiniband --cap-add=IPC_LOCK"
+else
+  echo "No RDMA devices found on host, RDMA tests will be skipped"
+fi

+# --- Route: multi-node vs single-node ---
+if is_multi_node "$commands"; then
+  echo "--- Multi-node job detected"
  export DCKR_VER=$(docker --version | sed 's/Docker version \(.*\), build .*/\1/')

-  if [[ "$commands" =~ ^(.*)"["(.*)"] && ["(.*)"]"$ ]]; then
-      prefix=$( echo "${BASH_REMATCH[1]}" | sed 's/;//g')
-      echo "PREFIX: ${prefix}"
-      export composite_command="(command rocm-smi || true)"
-      myIFS=$IFS
-      IFS=','
-      read -ra node0 <<< ${BASH_REMATCH[2]}
-      read -ra node1 <<< ${BASH_REMATCH[3]}
-      IFS=$myIFS
-      for i in "${!node0[@]}";do 
-        command_node_0=$(echo ${node0[i]} | sed 's/\"//g')
-        command_node_1=$(echo ${node1[i]} | sed 's/\"//g')
-        
-        export commands="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
-        echo "COMMANDS: ${commands}"
-        composite_command=$(echo "${composite_command} && ${commands}")
-      done
-      /bin/bash -c "${composite_command}"
-      cleanup_network
+  # Parse the bracket syntax:  prefix ; [node0_cmds] && [node1_cmds]
+  #   BASH_REMATCH[1] = prefix (everything before first bracket)
+  #   BASH_REMATCH[2] = comma-separated node0 commands
+  #   BASH_REMATCH[3] = comma-separated node1 commands
+  if [[ "$commands" =~ ^(.*)\[(.*)"] && ["(.*)\]$ ]]; then
+    prefix=$(echo "${BASH_REMATCH[1]}" | sed 's/;//g')
+    echo "PREFIX: ${prefix}"
+
+    export composite_command="(command rocm-smi || true)"
+    saved_IFS=$IFS
+    IFS=','
+    read -ra node0 <<< "${BASH_REMATCH[2]}"
+    read -ra node1 <<< "${BASH_REMATCH[3]}"
+    IFS=$saved_IFS
+
+    if [[ ${#node0[@]} -ne ${#node1[@]} ]]; then
+      echo "Warning: node0 has ${#node0[@]} commands, node1 has ${#node1[@]}. They will be paired by index."
+    fi
+
+    for i in "${!node0[@]}"; do
+      command_node_0=$(echo "${node0[i]}" | sed 's/\"//g')
+      command_node_1=$(echo "${node1[i]}" | sed 's/\"//g')
+
+      step_cmd="./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 ${image_name} '${command_node_0}' '${command_node_1}'"
+      echo "COMMANDS: ${step_cmd}"
+      composite_command="${composite_command} && ${step_cmd}"
+    done
+
+    /bin/bash -c "${composite_command}"
+    exit_code=$?
+    cleanup_network
+    handle_pytest_exit "$exit_code"
  else
-      echo "Failed to parse node commands! Exiting."
-      cleanup_network
-      exit 111
+    echo "Multi-node job detected but failed to parse bracket command syntax."
+    echo "Expected format: prefix ; [node0_cmd1, node0_cmd2] && [node1_cmd1, node1_cmd2]"
+    echo "Got: $commands"
+    cleanup_network
+    exit 111
  fi
 else
+  echo "--- Single-node job"
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
  docker run \
-          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-          --network=host \
-          --shm-size=16gb \
-          --group-add "$render_gid" \
-          --rm \
-          -e HF_TOKEN \
-          -e AWS_ACCESS_KEY_ID \
-          -e AWS_SECRET_ACCESS_KEY \
-          -v "${HF_CACHE}:${HF_MOUNT}" \
-          -e "HF_HOME=${HF_MOUNT}" \
-          -e "PYTHONPATH=${MYPYTHONPATH}" \
-          --name "${container_name}" \
-          "${image_name}" \
-          /bin/bash -c "${commands}"
+    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+    $RDMA_FLAGS \
+    --network=host \
+    --shm-size=16gb \
+    --group-add "$render_gid" \
+    --rm \
+    -e HF_TOKEN \
+    -e AWS_ACCESS_KEY_ID \
+    -e AWS_SECRET_ACCESS_KEY \
+    -e BUILDKITE_PARALLEL_JOB \
+    -e BUILDKITE_PARALLEL_JOB_COUNT \
+    -v "${HF_CACHE}:${HF_MOUNT}" \
+    -e "HF_HOME=${HF_MOUNT}" \
+    -e "PYTHONPATH=${MYPYTHONPATH}" \
+    --name "${container_name}" \
+    "${image_name}" \
+    /bin/bash -c "${commands}"
+
+  exit_code=$?
+  handle_pytest_exit "$exit_code"
 fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh
+#!/bin/bash
+set -euox pipefail
+
+export VLLM_CPU_KVCACHE_SPACE=1 
+export VLLM_CPU_CI_ENV=1
+# Reduce sub-processes for acceleration
+export TORCH_COMPILE_DISABLE=1 
+export VLLM_ENABLE_V1_MULTIPROCESSING=0
+
+SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
+SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"
+wget "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"
+echo "${SDE_CHECKSUM}  ${SDE_ARCHIVE}" | sha256sum --check
+mkdir -p sde
+tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
+
+# Wait for a background process and report how it ended.
+#
+# Arguments:
+#   $1 - PID of a background child of this shell (as required by 'wait').
+#   $2 - log file whose last 50 lines are printed when the process failed.
+# Returns:
+#   0 if the process exited with status 0; 1 if it failed or if either
+#   argument is missing.
+wait_for_pid_and_check_log() {
+    # Default to empty so that, under 'set -u', a missing argument reaches
+    # the usage message below instead of dying with "unbound variable".
+    local pid="${1:-}"
+    local log_file="${2:-}"
+    local exit_status=0
+
+    if [ -z "$pid" ] || [ -z "$log_file" ]; then
+        echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>" >&2
+        return 1
+    fi
+
+    echo "Waiting for process $pid to finish..."
+
+    # 'wait' returns the exit status of the waited-for process. Capturing it
+    # on the right-hand side of '||' keeps 'set -e' from aborting the script
+    # before the log tail can be printed.
+    wait "$pid" || exit_status=$?
+
+    if [ "$exit_status" -ne 0 ]; then
+        echo "Process $pid finished with exit status $exit_status (Failure)."
+        echo "Process exited with a non-zero status."
+        echo "--- Last few lines of log file: $log_file ---"
+        # An unreadable log must not abort the function under 'set -e' and
+        # hide the real failure.
+        tail -n 50 "$log_file" || echo "(could not read log file: $log_file)"
+        echo "---------------------------------------------"
+        return 1 # Indicate failure based on exit status
+    fi
+
+    echo "Process $pid finished with exit status $exit_status (Success)."
+    # The log file is only read on failure, so do not claim it was checked.
+    echo "Process exited successfully."
+    return 0
+}
+
+# Test Sky Lake (AVX512F)
+./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
+PID_TEST_0=$!
+
+# Test Cascade Lake (AVX512F + VNNI)
+./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
+PID_TEST_1=$!
+
+# Test Cooper Lake (AVX512F + VNNI + BF16)
+./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
+PID_TEST_2=$!
+
+# Check every run before failing: with 'set -e' a bare call that returns
+# non-zero would end the script immediately and hide the logs of the
+# remaining runs. The script still exits non-zero if any run failed.
+failed=0
+wait_for_pid_and_check_log "$PID_TEST_0" test_0.log || failed=1
+wait_for_pid_and_check_log "$PID_TEST_1" test_1.log || failed=1
+wait_for_pid_and_check_log "$PID_TEST_2" test_2.log || failed=1
+exit "$failed"
--- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
 #!/bin/bash
 set -euox pipefail
+export VLLM_CPU_CI_ENV=0

 echo "--- PP+TP"
 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
 server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
 vllm bench serve \
    --backend vllm \
    --dataset-name random \
    --model meta-llama/Llama-3.2-3B-Instruct \
    --num-prompts 20 \
+    --result-dir ./test_results \
+    --result-filename tp_pp.json \
+    --save-result \
    --endpoint /v1/completions
-kill -s SIGTERM $server_pid &
+kill -s SIGTERM $server_pid; wait $server_pid || true
+# Fail closed: if '.failed' is missing, jq prints "null", and a numeric
+# '-ne' test on that errors out with status 2, which 'if' treats as false,
+# so the check would silently pass. Comparing against the string "0" treats
+# anything other than an explicit zero as a failure.
+failed_req=$(jq '.failed' ./test_results/tp_pp.json)
+if [ "$failed_req" != "0" ]; then
+  echo "Some requests were failed!"
+  exit 1
+fi

 echo "--- DP+TP"
 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
 server_pid=$!
-timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
 vllm bench serve \
    --backend vllm \
    --dataset-name random \
    --model meta-llama/Llama-3.2-3B-Instruct \
    --num-prompts 20 \
+    --result-dir ./test_results \
+    --result-filename dp_pp.json \
+    --save-result \
    --endpoint /v1/completions
-kill -s SIGTERM $server_pid &
+kill -s SIGTERM $server_pid; wait $server_pid || true
+# Fail closed: if '.failed' is missing, jq prints "null", and a numeric
+# '-ne' test on that errors out with status 2, which 'if' treats as false,
+# so the check would silently pass. Comparing against the string "0" treats
+# anything other than an explicit zero as a failure.
+failed_req=$(jq '.failed' ./test_results/dp_pp.json)
+if [ "$failed_req" != "0" ]; then
+  echo "Some requests were failed!"
+  exit 1
+fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -34,7 +34,7 @@ function cpu_tests() {
  # offline inference
  docker exec cpu-test bash -c "
    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m"

  # Run model tests
  docker exec cpu-test bash -c "

--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -27,7 +27,7 @@ function cpu_tests() {
  podman exec -it "$container_id" bash -c "
    export TORCH_COMPILE_DISABLE=1
    set -xve
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m" >> "$HOME"/test_basic.log

  # Run basic model test
  podman exec -it "$container_id" bash -c "
@@ -43,7 +43,7 @@ function cpu_tests() {
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
-    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
+    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> "$HOME"/test_rest.log
 }

 # All of CPU tests are expected to be finished less than 40 mins.

--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
 docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run --rm --cpuset-cpus=$CORE_RANGE --cpuset-mems=$NUMA_NODE -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g $IMAGE_NAME \
-        timeout $TIMEOUT_VAL bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
+docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
+        timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@@ -25,5 +25,5 @@ remove_docker_container

 # Run the image and test offline inference
 docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
+    python3 examples/basic/offline_inference/generate.py --model meta-llama/Llama-3.2-1B
 '
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
 #!/bin/bash

-# This script build the CPU docker image and run the offline inference inside the container.
+# This script builds the HPU docker image and runs the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
+#
+# vllm-gaudi compatibility pinning:
+#   The vllm-gaudi plugin is installed on top of the vllm upstream checkout used by this CI job.
+#   When upstream vllm changes its API, the plugin may break before it has been updated.
+#   To handle this, the vllm-gaudi repository maintains a file:
+#     vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT
+#   The first line of that file controls what version of vllm is used inside the Docker image:
+#     - "latest"        : no checkout override; the current Buildkite CI commit is used as-is.
+#     - "<commit SHA>"  : vllm is checked out to that specific commit before building, pinning
+#                         the test to a known-compatible baseline.
+#   To unpin (resume testing against the live vllm tip), set the file content back to "latest".
 set -exuo pipefail

+# Fetch the vllm community commit reference from vllm-gaudi (first line only).
+# -f makes curl exit non-zero on an HTTP error (e.g. 404) instead of passing
+# the error page on as if it were the commit; with 'pipefail' set above, that
+# failure aborts the script. -S keeps curl's error message visible despite -s.
+VLLM_COMMUNITY_COMMIT=$(curl -fsS \
+  https://raw.githubusercontent.com/vllm-project/vllm-gaudi/vllm/last-good-commit-for-vllm-gaudi/VLLM_COMMUNITY_COMMIT \
+  | head -1 | tr -d '\r\n')
+
+# An empty reference is not "latest", so the Dockerfile below would run
+# 'git checkout' with no argument and silently pin nothing. Reject it here.
+if [ -z "${VLLM_COMMUNITY_COMMIT}" ]; then
+  echo "Error: fetched VLLM_COMMUNITY_COMMIT reference is empty." >&2
+  exit 1
+fi
+
+echo "Using vllm community commit: ${VLLM_COMMUNITY_COMMIT}"
+
 # Try building the docker image
 image_name="hpu/upstream-vllm-ci:${BUILDKITE_COMMIT}"
 container_name="hpu-upstream-vllm-ci-${BUILDKITE_COMMIT}-container"
-cat <<EOF | docker build -t ${image_name} -f - .
+cat <<EOF | docker build -t "${image_name}" -f - .
 FROM gaudi-base-image:latest

 COPY ./ /workspace/vllm

+# If VLLM_COMMUNITY_COMMIT is a specific commit (not "latest"), check it out to pin vllm
+# to the version known to be compatible with vllm-gaudi. When the value is "latest",
+# the current checkout (the Buildkite CI commit) is used unchanged.
+RUN if [ "${VLLM_COMMUNITY_COMMIT}" != "latest" ]; then \
+      cd /workspace/vllm && git fetch --unshallow 2>/dev/null || true && git checkout ${VLLM_COMMUNITY_COMMIT}; \
+    fi
+
 WORKDIR /workspace/vllm

 ENV no_proxy=localhost,127.0.0.1
@@ -39,19 +64,19 @@ EOF
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_containers() { docker rm -f ${container_name} || true; }
+remove_docker_containers() { docker rm -f "${container_name}" || true; }
 trap 'remove_docker_containers; exit $EXITCODE;' EXIT
 remove_docker_containers

 echo "Running HPU plugin v1 test"
-docker run --rm --runtime=habana --name=${container_name} --network=host \
+docker run --rm --runtime=habana --name="${container_name}" --network=host \
  -e HABANA_VISIBLE_DEVICES=all \
  -e VLLM_SKIP_WARMUP=true \
  -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
  -e PT_HPU_LAZY_MODE=1 \
  "${image_name}" \
  /bin/bash -c '
-  cd vllm; timeout 120s python -u examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  cd vllm; timeout 120s python -u examples/basic/offline_inference/generate.py --model facebook/opt-125m
 '

 EXITCODE=$?

--- a/.buildkite/scripts/hardware_ci/run-npu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -41,6 +41,7 @@ get_config() {
        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
        exit 1
    fi
+    # shellcheck source=/dev/null
    source "${TEST_RUN_CONFIG_FILE}"
    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
    return 0
@@ -48,9 +49,8 @@ get_config() {

 # get test running configuration.
 fetch_vllm_test_cfg
-get_config
 # Check if the function call was successful. If not, exit the script.
-if [ $? -ne 0 ]; then
+if ! get_config; then
  exit 1
 fi

@@ -62,14 +62,14 @@ agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
 echo "agent_idx: ${agent_idx}"
 builder_name="cachebuilder${agent_idx}"
 builder_cache_dir="/mnt/docker-cache${agent_idx}"
-mkdir -p ${builder_cache_dir}
+mkdir -p "${builder_cache_dir}"

 # Try building the docker image
 cat <<EOF | DOCKER_BUILDKIT=1 docker build \
-    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
-    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
-                           --cache-to type=local,dest=${builder_cache_dir},mode=max \
-    --progress=plain --load -t ${image_name} -f - .
+    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:"${PYPI_CACHE_HOST}" \
+    --builder "${builder_name}" --cache-from type=local,src="${builder_cache_dir}" \
+                           --cache-to type=local,dest="${builder_cache_dir}",mode=max \
+    --progress=plain --load -t "${image_name}" -f - .
 FROM ${BASE_IMAGE_NAME}

 # Define environments
@@ -116,7 +116,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
-    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/devlib && \
    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/

 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
@@ -139,7 +139,7 @@ trap remove_docker_container EXIT
 # Generate corresponding --device args based on BUILDKITE_AGENT_NAME
 # Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
 #   e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
-#   returns --device /dev/davinci0 --device /dev/davinci1
+#   returns one argument per line: --device, /dev/davinciX, ...
 parse_and_gen_devices() {
    local input="$1"
    local index cards_num
@@ -151,29 +151,24 @@ parse_and_gen_devices() {
        return 1
    fi

-    local devices=""
    local i=0
    while (( i < cards_num )); do
        local dev_idx=$(((index - 1)*cards_num + i ))
-        devices="$devices --device /dev/davinci${dev_idx}"
+        printf '%s\n' "--device"
+        printf '%s\n' "/dev/davinci${dev_idx}"
        ((i++))
    done
-
-    # trim leading space
-    devices="${devices#"${devices%%[![:space:]]*}"}"
-    # Output devices: assigned to the caller variable
-    printf '%s' "$devices"
 }

-devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+mapfile -t device_args < <(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1

 # Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
 # This test checks whether the OOT platform interface is functioning properly in conjunction with
 # the hardware plugin vllm-ascend.
 model_cache_dir=/mnt/modelscope${agent_idx}
-mkdir -p ${model_cache_dir}
+mkdir -p "${model_cache_dir}"
 docker run \
-    ${devices} \
+    "${device_args[@]}" \
    --device /dev/davinci_manager \
    --device /dev/devmm_svm \
    --device /dev/hisi_hdc \
@@ -182,7 +177,7 @@ docker run \
    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
    -v /etc/ascend_install.info:/etc/ascend_install.info \
-    -v ${model_cache_dir}:/root/.cache/modelscope \
+    -v "${model_cache_dir}":/root/.cache/modelscope \
    --entrypoint="" \
    --name "${container_name}" \
    "${image_name}" \

--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"


--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.11" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"


--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

 # Try building the docker image
-docker build -t ${image_name} -f docker/Dockerfile.xpu .
+docker build -t "${image_name}" -f docker/Dockerfile.xpu .

 # Setup cleanup
 remove_docker_container() {
@@ -34,17 +34,17 @@ docker run \
    set -e
    echo $ZE_AFFINITY_MASK
    pip install tblib==3.1.0
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
-    python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
-    python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
+    python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8
+    python3 examples/basic/offline_inference/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4  --block-size 64 --enforce-eager
+    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2
+    python3 examples/basic/offline_inference/generate.py --model ibm-research/PowerMoE-3b  --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel
    cd tests
-    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py
+    pytest -v -s v1/core --ignore=v1/core/test_reset_prefix_cache_e2e.py --ignore=v1/core/test_scheduler_e2e.py
    pytest -v -s v1/engine
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py