Merge tag 'v0.15.0rc1' into v0.15.0rc1-ori

82e40fb7 · zhuwenwen · 30a1922e · 58996f35 · 82e40fb7 · 82e40fb7
Commit 82e40fb7 authored Jan 27, 2026 by zhuwenwen
20 changed files
--- a/.buildkite/ci_config.yaml
+++ b/.buildkite/ci_config.yaml
 name: vllm_ci
 job_dirs:
-  - ".buildkite/test_areas"
  - ".buildkite/image_build"
+  - ".buildkite/test_areas"
+  - ".buildkite/hardware_tests"
 run_all_patterns:
  - "docker/Dockerfile"
  - "CMakeLists.txt"

--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
+# Hardware CI (AMD): builds the ROCm CI image from docker/Dockerfile.rocm
+# (target "test"), tags it rocm/vllm-ci:<BUILDKITE_COMMIT> and pushes it.
+group: Hardware
+steps:
+  - label: "AMD: :docker: build image"
+    device: amd_cpu
+    no_plugin: true
+    commands:
+    # Folded scalar: the lines below are joined into one `docker build` command.
+    - >
+      docker build
+      --build-arg max_jobs=16
+      --build-arg REMOTE_VLLM=1
+      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942'
+      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
+      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
+      -f docker/Dockerfile.rocm
+      --target test
+      --no-cache
+      --progress plain .
+    - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
+    env:
+      DOCKER_BUILDKIT: "1"
+    # Each listed exit status is retried automatically at most once.
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 1
+        - exit_status: -10  # Agent was lost
+          limit: 1
+        - exit_status: 1  # Machine occasionally fail
+          limit: 1
--- a/.buildkite/hardware_tests/arm.yaml
+++ b/.buildkite/hardware_tests/arm.yaml
+# Hardware CI (Arm): runs the Arm CPU test script; the step is marked soft_fail.
+group: Hardware
+steps:
+  - label: "Arm CPU Test"
+    soft_fail: true
+    device: arm_cpu
+    no_plugin: true
+    commands:
+    - bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
--- a/.buildkite/hardware_tests/ascend_npu.yaml
+++ b/.buildkite/hardware_tests/ascend_npu.yaml
+# Hardware CI (Ascend NPU): runs the NPU test script with a 20-minute timeout;
+# the step is marked soft_fail.
+group: Hardware
+depends_on: ~
+steps:
+  - label: "Ascend NPU Test"
+    soft_fail: true
+    timeout_in_minutes: 20
+    no_plugin: true
+    device: ascend_npu
+    commands:
+    - bash .buildkite/scripts/hardware_ci/run-npu-test.sh
--- a/.buildkite/hardware_tests/gh200.yaml
+++ b/.buildkite/hardware_tests/gh200.yaml
+# Hardware CI (GH200): prints GPU state with nvidia-smi, then runs the GH200
+# test script; the step is marked soft_fail and optional.
+group: Hardware
+steps:
+  - label: "GH200 Test"
+    soft_fail: true
+    device: gh200
+    no_plugin: true
+    optional: true
+    commands:
+    - nvidia-smi
+    - bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
--- a/.buildkite/hardware_tests/intel.yaml
+++ b/.buildkite/hardware_tests/intel.yaml
+# Hardware CI (Intel): one soft_fail step each for the CPU, HPU and GPU (XPU)
+# test scripts.
+group: Hardware
+depends_on: ~
+steps:
+  - label: "Intel CPU Test"
+    soft_fail: true
+    device: intel_cpu
+    no_plugin: true
+    commands:
+    - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
+  - label: "Intel HPU Test"
+    soft_fail: true
+    device: intel_hpu
+    no_plugin: true
+    commands:
+    - bash .buildkite/scripts/hardware_ci/run-hpu-test.sh
+  - label: "Intel GPU Test"
+    soft_fail: true
+    device: intel_gpu
+    no_plugin: true
+    commands:
+    - bash .buildkite/scripts/hardware_ci/run-xpu-test.sh
--- a/.buildkite/image_build/image_build.sh
+++ b/.buildkite/image_build/image_build.sh
 #!/bin/bash
-set -e
+set -euo pipefail
-if [[ $# -lt 8 ]]; then
+# Sanitize a string for use as a Docker image tag: every character outside
+# [a-zA-Z0-9._-] is replaced with "_" and the result is truncated to 128 chars.
+# Arguments: $1 - raw string (e.g. a branch name)
+# Outputs:   the sanitized tag on stdout
+clean_docker_tag() {
+    local input="$1"
+    # printf rather than echo: echo would treat inputs such as "-n" or "-e"
+    # as options and print nothing (or interpret escapes) instead of the text.
+    printf '%s\n' "$input" | sed 's/[^a-zA-Z0-9._-]/_/g' | cut -c1-128
+}
+print_usage_and_exit() {
    echo "Usage: $0 <registry> <repo> <commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
    exit 1
+}
+# Print debug information about the build host: EC2 instance metadata (when
+# IMDSv2 answers) and whether the warm-cache marker file /etc/vllm-ami-info
+# exists. Informational only; an unreachable IMDS is reported, not fatal.
+print_instance_info() {
+    # -f: make HTTP errors return non-zero so the "unknown" fallbacks and the
+    #     "not on EC2" branch actually trigger instead of printing an error body.
+    # timeouts: off EC2 the link-local address may never answer, and without a
+    #     limit curl can block the build for minutes.
+    local imds_curl=(curl -sf --connect-timeout 2 --max-time 5)
+    echo ""
+    echo "=== Debug: Instance Information ==="
+    # Get IMDSv2 token
+    if TOKEN=$("${imds_curl[@]}" -X PUT "http://169.254.169.254/latest/api/token" \
+            -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null); then
+        AMI_ID=$("${imds_curl[@]}" -H "X-aws-ec2-metadata-token: $TOKEN" \
+            http://169.254.169.254/latest/meta-data/ami-id 2>/dev/null || echo "unknown")
+        INSTANCE_TYPE=$("${imds_curl[@]}" -H "X-aws-ec2-metadata-token: $TOKEN" \
+            http://169.254.169.254/latest/meta-data/instance-type 2>/dev/null || echo "unknown")
+        INSTANCE_ID=$("${imds_curl[@]}" -H "X-aws-ec2-metadata-token: $TOKEN" \
+            http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")
+        AZ=$("${imds_curl[@]}" -H "X-aws-ec2-metadata-token: $TOKEN" \
+            http://169.254.169.254/latest/meta-data/placement/availability-zone 2>/dev/null || echo "unknown")
+        echo "AMI ID:        ${AMI_ID}"
+        echo "Instance Type: ${INSTANCE_TYPE}"
+        echo "Instance ID:   ${INSTANCE_ID}"
+        echo "AZ:            ${AZ}"
+    else
+        echo "Not running on EC2 or IMDS not available"
+    fi
+    # Check for warm cache AMI (marker file baked into custom AMI)
+    if [[ -f /etc/vllm-ami-info ]]; then
+        echo "Cache:         warm (custom vLLM AMI)"
+        cat /etc/vllm-ami-info
+    else
+        echo "Cache:         cold (standard AMI)"
+    fi
+    echo "==================================="
+    echo ""
+}
+# Select (or create) the docker buildx builder used for the build and bootstrap it.
+# Reads globals: BUILDKIT_SOCKET, BUILDER_NAME.
+# Preference order:
+#   1. a buildkitd unix socket at BUILDKIT_SOCKET -> "baked-vllm-builder" (remote driver)
+#   2. an already existing builder named BUILDER_NAME
+#   3. a newly created docker-container builder named BUILDER_NAME
+setup_buildx_builder() {
+    echo "--- :buildkite: Setting up buildx builder"
+    if [[ -S "${BUILDKIT_SOCKET}" ]]; then
+        # Custom AMI with standalone buildkitd - use remote driver for warm cache
+        echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
+        echo "Using remote driver to connect to buildkitd (warm cache available)"
+        if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
+            echo "Using existing baked-vllm-builder"
+            docker buildx use baked-vllm-builder
+        else
+            echo "Creating baked-vllm-builder with remote driver"
+            docker buildx create \
+                --name baked-vllm-builder \
+                --driver remote \
+                --use \
+                "unix://${BUILDKIT_SOCKET}"
+        fi
+        docker buildx inspect --bootstrap
+    elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
+        # Existing builder available
+        echo "Using existing builder: ${BUILDER_NAME}"
+        docker buildx use "${BUILDER_NAME}"
+        docker buildx inspect --bootstrap
+    else
+        # No local buildkitd, no existing builder - create new docker-container builder
+        echo "No local buildkitd found, using docker-container driver"
+        docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
+        docker buildx inspect --bootstrap
+    fi
+    # builder info
+    echo "Active builder:"
+    # Show only the header and the active ("*") builder; fall back to the full
+    # listing when grep matches nothing.
+    docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls
+}
+# Skip the build when the target image is already published.
+# Reads global: IMAGE_TAG (the check is skipped when it is unset or empty).
+# If `docker manifest inspect` succeeds for IMAGE_TAG, the whole script exits
+# with status 0; otherwise the function returns and the build proceeds.
+check_and_skip_if_image_exists() {
+    if [[ -n "${IMAGE_TAG:-}" ]]; then
+        echo "--- :mag: Checking if image exists"
+        if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
+            echo "Image already exists: ${IMAGE_TAG}"
+            echo "Skipping build"
+            exit 0
+        fi
+        echo "Image not found, proceeding with build"
+    fi
+}
+# Authenticate docker against AWS ECR (region us-east-1) using the aws CLI.
+# Reads global: REGISTRY.
+ecr_login() {
+    # Public ECR: log in to the registry passed as script argument 1.
+    aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+    # Private ECR registry of account 936637512419 (same host as the cache
+    # repositories named in prepare_cache_tags).
+    aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+}
+# Compute the registry cache references for this build and export them.
+# Reads: BUILDKITE_PULL_REQUEST, BUILDKITE_BRANCH and, for PR builds,
+#        BUILDKITE_PULL_REQUEST_BASE_BRANCH.
+# Non-PR builds (BUILDKITE_PULL_REQUEST == "false"):
+#   main         -> all three of CACHE_TO/CACHE_FROM/CACHE_FROM_BASE_BRANCH
+#                   point at the postmerge cache ":latest"
+#   other branch -> all three point at the test cache ":<sanitized branch>"
+# PR builds:
+#   CACHE_TO/CACHE_FROM    -> test cache ":pr-<PR number>"
+#   CACHE_FROM_BASE_BRANCH -> cache of the PR's base branch (rule as above)
+# CACHE_FROM_MAIN always points at the postmerge cache ":latest".
+prepare_cache_tags() {
+    # resolve and set: CACHE_TO, CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN
+    TEST_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
+    MAIN_CACHE_ECR="936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
+    if [[ "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+        if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
+            cache="${MAIN_CACHE_ECR}:latest"
+        else
+            # Branch names may contain characters that are invalid in a tag.
+            clean_branch=$(clean_docker_tag "$BUILDKITE_BRANCH")
+            cache="${TEST_CACHE_ECR}:${clean_branch}"
+        fi
+        CACHE_TO="$cache"
+        CACHE_FROM="$cache"
+        CACHE_FROM_BASE_BRANCH="$cache"
+    else
+        CACHE_TO="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
+        CACHE_FROM="${TEST_CACHE_ECR}:pr-${BUILDKITE_PULL_REQUEST}"
+        if [[ "$BUILDKITE_PULL_REQUEST_BASE_BRANCH" == "main" ]]; then
+            CACHE_FROM_BASE_BRANCH="${MAIN_CACHE_ECR}:latest"
+        else
+            clean_base=$(clean_docker_tag "$BUILDKITE_PULL_REQUEST_BASE_BRANCH")
+            CACHE_FROM_BASE_BRANCH="${TEST_CACHE_ECR}:${clean_base}"
+        fi
+    fi
+    CACHE_FROM_MAIN="${MAIN_CACHE_ECR}:latest"
+    export CACHE_TO CACHE_FROM CACHE_FROM_BASE_BRANCH CACHE_FROM_MAIN
+}
+# Ensure PARENT_COMMIT is set for the cache fallback.
+# A caller-provided, non-empty PARENT_COMMIT is kept as is. Otherwise it is set
+# to the result of `git rev-parse HEAD~1` and exported; if that lookup fails,
+# PARENT_COMMIT is left as an empty string and a notice is printed.
+resolve_parent_commit() {
+    if [[ -z "${PARENT_COMMIT:-}" ]]; then
+        # "|| echo" keeps a failing rev-parse from aborting the script under set -e.
+        PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
+        if [[ -n "${PARENT_COMMIT}" ]]; then
+            echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
+            export PARENT_COMMIT
+        else
+            echo "Could not determine parent commit (may be first commit in repo)"
+        fi
+    else
+        echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
+    fi
+}
+# Print the resolved bake configuration for TARGET, save it to
+# bake-config-build-<BUILDKITE_BUILD_NUMBER or "local">.json and upload that
+# file as a Buildkite artifact.
+# Reads globals: VLLM_BAKE_FILE, CI_HCL_PATH, TARGET.
+print_bake_config() {
+    echo "--- :page_facing_up: Resolved bake configuration"
+    BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
+    # "|| true": a failing --print is tolerated here, so the "Saved" message
+    # below is printed even when the file content is incomplete.
+    docker buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
+    echo "Saved bake config to ${BAKE_CONFIG_FILE}"
+    echo "--- :arrow_down: Uploading bake config to Buildkite"
+    # Not guarded: with the script's `set -e`, a failing upload aborts the script.
+    buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"
+}
+#################################
+#         Main Script           #
+#################################
+print_instance_info
+if [[ $# -lt 7 ]]; then
+    print_usage_and_exit
 fi
+# input args
 REGISTRY=$1
 REPO=$2
 BUILDKITE_COMMIT=$3
 BRANCH=$4
 VLLM_USE_PRECOMPILED=$5
 VLLM_MERGE_BASE_COMMIT=$6
-CACHE_FROM=$7
+IMAGE_TAG=$7
-CACHE_TO=$8
+IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional
-# authenticate with AWS ECR
+# build config
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+TARGET="test-ci"
-aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}"
+VLLM_BAKE_FILE="${VLLM_BAKE_FILE:-docker/docker-bake.hcl}"
-# docker buildx 
+BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
-docker buildx create --name vllm-builder --driver docker-container --use
+CI_HCL_PATH="/tmp/ci.hcl"
-docker buildx inspect --bootstrap
+BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
-docker buildx ls
+prepare_cache_tags
-# skip build if image already exists
+ecr_login
-if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
-  echo "Image not found, proceeding with build..."
+# Environment info (for docs and human readers)
-else
+#   CI_HCL_URL          - URL to ci.hcl (default: from ci-infra main branch)
-  echo "Image found"
+#   VLLM_CI_BRANCH      - ci-infra branch to use (default: main)
-  exit 0
+#   VLLM_BAKE_FILE      - Path to vLLM's bake file (default: docker/docker-bake.hcl)
-fi
+#   BUILDER_NAME        - Name for buildx builder (default: vllm-builder)
+#
+# Build configuration (exported as environment variables for bake):
+export BUILDKITE_COMMIT
+export PARENT_COMMIT
+export IMAGE_TAG
+export IMAGE_TAG_LATEST
+export CACHE_FROM
+export CACHE_FROM_BASE_BRANCH
+export CACHE_FROM_MAIN
+export CACHE_TO
+export VLLM_USE_PRECOMPILED
+export VLLM_MERGE_BASE_COMMIT
+# print args
+echo "--- :mag: Arguments"
+echo "REGISTRY: ${REGISTRY}"
+echo "REPO: ${REPO}"
+echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
+echo "BRANCH: ${BRANCH}"
+echo "VLLM_USE_PRECOMPILED: ${VLLM_USE_PRECOMPILED}"
+echo "VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
+echo "IMAGE_TAG: ${IMAGE_TAG}"
+echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}"
+# print build configuration
+echo "--- :mag: Build configuration"
+echo "TARGET: ${TARGET}"
+echo "CI HCL URL: ${CI_HCL_URL}"
+echo "vLLM bake file: ${VLLM_BAKE_FILE}"
+echo "BUILDER_NAME: ${BUILDER_NAME}"
+echo "CI_HCL_PATH: ${CI_HCL_PATH}"
+echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}"
+echo "--- :mag: Cache tags"
+echo "CACHE_TO: ${CACHE_TO}"
+echo "CACHE_FROM: ${CACHE_FROM}"
+echo "CACHE_FROM_BASE_BRANCH: ${CACHE_FROM_BASE_BRANCH}"
+echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
+check_and_skip_if_image_exists
-if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+echo "--- :docker: Setting up Docker buildx bake"
-  merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+echo "Target: ${TARGET}"
-else
+echo "CI HCL URL: ${CI_HCL_URL}"
-  merge_base_commit_build_args=""
+echo "vLLM bake file: ${VLLM_BAKE_FILE}"
+if [[ ! -f "${VLLM_BAKE_FILE}" ]]; then
+    echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE}"
+    echo "Make sure you're running from the vLLM repository root"
+    exit 1
 fi
-# build
+echo "--- :arrow_down: Downloading ci.hcl"
-docker buildx build --file docker/Dockerfile \
+curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
-  --build-arg max_jobs=16 \
+echo "Downloaded to ${CI_HCL_PATH}"
-  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
-  --build-arg USE_SCCACHE=1 \
+setup_buildx_builder
-  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
-  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+# Compute parent commit for cache fallback (if not already set)
-  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+resolve_parent_commit
-  ${merge_base_commit_build_args} \
+export PARENT_COMMIT
-  --cache-from type=registry,ref=${CACHE_FROM},mode=max \
-  --cache-to type=registry,ref=${CACHE_TO},mode=max \
+print_bake_config
-  --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
-  $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+echo "--- :docker: Building ${TARGET}"
-  --push \
+docker --debug buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
-  --target test \
-  --progress plain .
+echo "--- :white_check_mark: Build complete"
--- a/.buildkite/image_build/image_build.yaml
+++ b/.buildkite/image_build/image_build.yaml
@@ -4,7 +4,9 @@ steps:
    key: image-build
    depends_on: []
    commands:
-    - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
+    # image_build.sh reads IMAGE_TAG from arg 7 and the optional IMAGE_TAG_LATEST from arg 8, so main must pass both.
+    - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi
+    - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi
    retry:
      automatic:
        - exit_status: -1  # Agent was lost

--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1131,7 +1131,7 @@ steps:
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py

--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1017,7 +1017,7 @@ steps:
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -1316,7 +1316,7 @@ steps:
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
-  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
+  - pytest -v -s plugins/lora_resolvers # unit tests for lora resolver plugins
 - label: Pipeline + Context Parallelism Test # 45min
  timeout_in_minutes: 60

--- a/.buildkite/test_areas/attention.yaml
+++ b/.buildkite/test_areas/attention.yaml
@@ -4,7 +4,7 @@ depends_on:
 steps:
 - label: V1 attention (H100)
  timeout_in_minutes: 30
-  gpu: h100
+  device: h100
  source_file_dependencies:
    - vllm/config/attention.py
    - vllm/model_executor/layers/attention
@@ -15,7 +15,7 @@ steps:
 - label: V1 attention (B200)
  timeout_in_minutes: 30
-  gpu: b200
+  device: b200
  source_file_dependencies:
    - vllm/config/attention.py
    - vllm/model_executor/layers/attention

--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Fusion and Compile Tests (B200)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -26,7 +26,7 @@ steps:
    - nvidia-smi
    - pytest -v -s tests/compile/test_fusion_attn.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    # this runner has 2 GPUs available even though num_devices=2 is not set
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
    # Wrap with quotes to escape yaml
@@ -37,9 +37,9 @@ steps:
 - label: Fusion E2E (2 GPUs)(B200)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  optional: true
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py

--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -5,7 +5,7 @@ steps:
 - label: Distributed Comm Ops
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
@@ -18,7 +18,7 @@ steps:
 - label: Distributed (2 GPUs)
  timeout_in_minutes: 90
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
@@ -54,7 +54,7 @@ steps:
 - label: Distributed Tests (4 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_utils
@@ -103,8 +103,8 @@ steps:
 - label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
-  gpu: h100
+  device: h100
-  num_gpus: 8
+  num_devices: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
@@ -120,9 +120,9 @@ steps:
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 - label: Distributed Tests (4 GPUs)(A100)
-  gpu: a100
+  device: a100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/
  commands:
@@ -133,26 +133,34 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py
- label: Distributed Tests (2 GPUs)(H200)
+- label: Sequence Parallel Tests (H100)
-  gpu: h200
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  num_devices: 2
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # Run sequence parallel tests
+    - pytest -v -s tests/distributed/test_sequence_parallel.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+- label: Distributed Tests (2 GPUs)(H100)
+  device: h100
  optional: true
  working_dir: "/vllm-workspace/"
-  num_gpus: 2
+  num_devices: 2
  commands:
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py
 - label: Distributed Tests (2 GPUs)(B200)
-  gpu: b200
+  device: b200
  optional: true
  working_dir: "/vllm-workspace/"
-  num_gpus: 2
+  num_devices: 2
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -161,8 +169,9 @@ steps:
 - label: 2 Node Test (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
+  num_devices: 2
  num_nodes: 2
+  no_plugin: true
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -176,7 +185,7 @@ steps:
 - label: Distributed NixlConnector PD accuracy (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
    - tests/v1/kv_connector/nixl_integration/
@@ -184,10 +193,21 @@ steps:
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
+- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_devices: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 - label: Pipeline + Context Parallelism (4 GPUs))
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
@@ -197,3 +217,45 @@ steps:
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
+- label: Hopper Fusion E2E Tests (H100)
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # skip Llama-4 since it does not fit on this device
+    - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
+- label: Hopper Fusion Distributed E2E Tests (2xH100)
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  num_devices: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  commands:
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
--- a/.buildkite/test_areas/e2e_integration.yaml
+++ b/.buildkite/test_areas/e2e_integration.yaml
@@ -4,27 +4,27 @@ depends_on:
 steps:
 - label: DeepSeek V2-Lite Accuracy
  timeout_in_minutes: 60
-  gpu: h100
+  device: h100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
 - label: Qwen3-30B-A3B-FP8-block Accuracy
  timeout_in_minutes: 60
-  gpu: h100
+  device: h100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
 - label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
  timeout_in_minutes: 60
-  gpu: b200
+  device: b200
  optional: true
-  num_gpus: 2
+  num_devices: 2
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
@@ -33,10 +33,11 @@ steps:
  timeout_in_minutes: 30
  optional: true
  soft_fail: true
-  num_gpus: 2
+  num_devices: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
+    - nvidia-smi
    - bash .buildkite/scripts/run-prime-rl-test.sh
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -23,4 +23,8 @@ steps:
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
-    - pytest -v -s v1/engine
+    # Run this test standalone for now;
+    # need to untangle the (implicit) use of spawn/fork across the tests.
+    - pytest -v -s v1/engine/test_preprocess_error_handling.py
+    # Run the rest of v1/engine tests
+    - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -14,7 +14,7 @@ steps:
 - label: EPLB Execution
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py

--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -57,8 +57,8 @@ steps:
 - label: Kernels DeepGEMM Test (H100)
  timeout_in_minutes: 45
-  gpu: h100
+  device: h100
-  num_gpus: 1
+  num_devices: 1
  source_file_dependencies:
  - tools/install_deepgemm.sh
  - vllm/utils/deep_gemm.py
@@ -77,7 +77,7 @@ steps:
 - label: Kernels (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
@@ -85,7 +85,7 @@ steps:
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
-  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -115,3 +115,54 @@ steps:
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
+    # e2e
+    - pytest -v -s tests/models/quantization/test_nvfp4.py
+- label: Kernels Helion Test
+  timeout_in_minutes: 30
+  device: h100
+  source_file_dependencies:
+  - vllm/utils/import_utils.py
+  - tests/kernels/helion/
+  commands:
+    - pip install helion
+    - pytest -v -s kernels/helion/
+- label: Kernels FP8 MoE Test (1 H100)
+  timeout_in_minutes: 90
+  device: h100
+  num_devices: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutlass_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer.py
+    - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
+    - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
+    - pytest -v -s kernels/moe/test_moe.py
+    # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
+    - pytest -v -s kernels/moe/test_block_int8.py
+    - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
+    - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
+- label: Kernels FP8 MoE Test (2 H100s)
+  timeout_in_minutes: 90
+  device: h100
+  num_devices: 2
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
+    - pytest -v -s kernels/moe/test_deepep_moe.py
+    - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
+    # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
+- label: Kernels Fp4 MoE Test (B200)
+  timeout_in_minutes: 60
+  device: b200
+  num_devices: 1
+  optional: true
+  commands:
+    - pytest -v -s kernels/moe/test_cutedsl_moe.py
+    - pytest -v -s kernels/moe/test_flashinfer_moe.py
+    - pytest -v -s kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s kernels/moe/test_ocp_mx_moe.py
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -12,9 +12,9 @@ steps:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt
 - label: LM Eval Large Models (4 GPUs)(A100)
-  gpu: a100
+  device: a100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
@@ -24,9 +24,9 @@ steps:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 - label: LM Eval Large Models (4 GPUs)(H100)
-  gpu: h100
+  device: h100
  optional: true
-  num_gpus: 4
+  num_devices: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
@@ -37,10 +37,39 @@ steps:
 - label: LM Eval Small Models (B200)
  timeout_in_minutes: 120
-  gpu: b200
+  device: b200
  optional: true
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
+- label: LM Eval Large Models (H200)
+  timeout_in_minutes: 60
+  device: h200
+  optional: true
+  num_devices: 8
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt
+- label: MoE Refactor Integration Test (H100 - TEMPORARY)
+  device: h100
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt
+- label: MoE Refactor Integration Test (B200 - TEMPORARY)
+  device: b200
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt
+- label: MoE Refactor Integration Test (B200 DP - TEMPORARY)
+  device: b200
+  optional: true
+  num_devices: 2
+  commands:
+    - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor-dp-ep/config-b200.txt
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -14,7 +14,7 @@ steps:
 - label: LoRA TP (Distributed)
  timeout_in_minutes: 30
-  num_gpus: 4
+  num_devices: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora

--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -31,7 +31,7 @@ steps:
  source_file_dependencies:
    - vllm/
    - tests/v1
-  no_gpu: true
+  device: cpu
  commands:
    # split the test to avoid interference
    - pytest -v -s -m 'cpu_test' v1/core
@@ -82,7 +82,7 @@ steps:
 - label: Metrics, Tracing (2 GPUs)
  timeout_in_minutes: 20
-  num_gpus: 2
+  num_devices: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
@@ -127,7 +127,7 @@ steps:
  - tests/tool_parsers
  - tests/transformers_utils
  - tests/config
-  no_gpu: true
+  device: cpu
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
@@ -142,7 +142,7 @@ steps:
 - label: GPT-OSS Eval (B200)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
-  gpu: b200
+  device: b200
  optional: true
  source_file_dependencies:
  - tests/evals/gpt_oss
@@ -155,7 +155,7 @@ steps:
 - label: Batch Invariance (H100)
  timeout_in_minutes: 25
-  gpu: h100
+  device: h100
  source_file_dependencies:
    - vllm/v1/attention
    - vllm/model_executor/layers