Merge tag 'v0.12.0' into v0.12.0-dev

41199996 · zhuwenwen · 31021d81 · 4fd9d6a8 · 41199996 · 41199996
Commit 41199996 authored Dec 13, 2025 by zhuwenwen
20 changed files
--- a/.buildkite/scripts/tpu/quantized_v6e_1.env
+++ b/.buildkite/scripts/tpu/quantized_v6e_1.env
@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=10.0
+EXPECTED_THROUGHPUT=8.7
 INPUT_LEN=1800
 OUTPUT_LEN=128
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo
-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
 --seed 42 \
 --max-num-seqs $MAX_NUM_SEQS \
 --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \

--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -2,6 +2,28 @@
 set -ex
+# ======== part 0: setup ========
+BUCKET="vllm-wheels"
+INDICES_OUTPUT_DIR="indices"
+DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
+PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+# detect if python3.12+ is available
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
+if [[ "$has_new_python" -eq 0 ]]; then
+    # use new python from docker
+    docker pull python:3-slim
+    PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
+fi
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+# ========= part 1: collect, rename & upload the wheel ==========
 # Assume wheels are in artifacts/dist/*.whl
 wheel_files=(artifacts/dist/*.whl)
@@ -10,82 +32,69 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
  exit 1
 fi
-# Get the single wheel file
 wheel="${wheel_files[0]}"
-# Detect architecture and rename 'linux' to appropriate manylinux version
+# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
-arch=$(uname -m)
+# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
-if [[ $arch == "x86_64" ]]; then
+manylinux_version="manylinux_2_31"
-    manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
-    manylinux_version="manylinux2014"
-else
-    echo "Warning: Unknown architecture $arch, using manylinux1 as default"
-    manylinux_version="manylinux1"
-fi
 # Rename 'linux' to the appropriate manylinux version in the wheel filename
+if [[ "$wheel" != *"linux"* ]]; then
+  echo "Error: Wheel filename does not contain 'linux': $wheel"
+  exit 1
+fi
 new_wheel="${wheel/linux/$manylinux_version}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
+echo "Renamed wheel to: $wheel"
 # Extract the version from the wheel
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
+echo "Version in wheel: $version"
+pure_version="${version%%+*}"
-normal_wheel="$wheel" # Save the original wheel filename
+echo "Pure version (without variant): $pure_version"
-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
-    suffix="${version##*.}"
-    if [[ $suffix == cu* ]]; then
-        new_version="1.0.0.dev+${suffix}"
-    else
-        new_version="1.0.0.dev"
-    fi
-    new_wheel="${wheel/$version/$new_version}"
-    # use cp to keep both files in the artifacts directory
-    cp -- "$wheel" "$new_wheel"
-    wheel="$new_wheel"
-    version="$new_version"
-fi
-# Upload the wheel to S3
+# copy wheel to its own bucket
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
-# generate index for this commit
+# ========= part 2: generate and upload indices ==========
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+# generate indices for all existing wheels in the commit directory
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+# this script might be run multiple times if there are multiple variants being built
+# so we need to guarantee there is little chance for "TOCTOU" issues
+# i.e., one process is generating indices while another is uploading a new wheel
+# so we need to ensure no time-consuming operations happen below
-if [[ $normal_wheel == *"cu126"* ]]; then
+# list all wheels in the commit directory
-    # if $normal_wheel matches cu126, do not upload the index.html
+echo "Existing wheels on S3:"
-    echo "Skipping index files for cu126 wheels"
+aws s3 ls "$S3_COMMIT_PREFIX"
-elif [[ $normal_wheel == *"cu128"* ]]; then
+obj_json="objects.json"
-    # if $normal_wheel matches cu128, do not upload the index.html
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
-    echo "Skipping index files for cu128 wheels"
+mkdir -p "$INDICES_OUTPUT_DIR"
+# call script to generate indices for all existing wheels
+# these indices use relative paths, which work as long as they are placed next to the wheel directory in s3
+# i.e., the wheels are always in s3://vllm-wheels/<commit>/
+# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+    alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
 else
-    # only upload index.html for cu129 wheels (default wheels) as it
+    alias_arg=""
-    # is available on both x86 and arm64
-    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
-    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
-# generate index for nightly
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
-if [[ $normal_wheel == *"cu126"* ]]; then
+# copy indices to /<commit>/ unconditionally
-    # if $normal_wheel matches cu126, do not upload the index.html
+echo "Uploading indices to $S3_COMMIT_PREFIX"
-    echo "Skipping index files for cu126 wheels"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
+# copy to /nightly/ only if it is on the main branch and not a PR 
-    echo "Skipping index files for cu128 wheels"
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
-else
+    echo "Uploading indices to overwrite /nightly/"
-    # only upload index.html for cu129 wheels (default wheels) as it
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
-    # is available on both x86 and arm64
-    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+# copy to /<pure_version>/ only if it does not have "dev" in the version
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
+if [[ "$version" != *"dev"* ]]; then
+    echo "Uploading indices to overwrite /$pure_version/"
+    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
+fi
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
+# In this file, you can add more tests to run either by adding a new step or
+# adding a new command to an existing step. See different options here for examples.
+# This script will be fed into the Jinja template in `test-template-aws.j2` at
+# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
+# to generate the final pipeline yaml file.
+# Documentation
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
+# command(str): the single command to run for tests. incompatible with commands.
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
+#     in this case, commands must be specified. the first command runs on the first host, the second
+#     command runs on the second host.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+#     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+# When adding a test
+# - If the test belongs to an existing group, add it there
+# - If the test is short, add to any existing step
+# - If the test takes more than 10min, then it is okay to create a new step.
+#   Note that all steps execute in parallel.
+steps:
+##### fast check tests  #####
+- label: Pytorch Nightly Dependency Override Check # 2min
+  # if this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies. Please check the error message and add the package to whitelist
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
+  soft_fail: true
+  source_file_dependencies:
+  - requirements/nightly_torch_test.txt
+  commands:
+  - bash standalone_tests/pytorch_nightly_dependency.sh
+- label: Async Engine, Inputs, Utils, Worker Test # 10min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/test_inputs.py
+  - tests/test_outputs.py
+  - tests/multimodal
+  - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
+  - tests/transformers_utils
+  - tests/config
+  no_gpu: true
+  commands:
+  - python3 standalone_tests/lazy_imports.py
+  - pytest -v -s test_inputs.py
+  - pytest -v -s test_outputs.py
+  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s tokenizers_
+  - pytest -v -s transformers_utils
+  - pytest -v -s config
+- label: Python-only Installation Test # 10min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - tests/standalone_tests/python_only_compile.sh
+  - setup.py
+  commands:
+  - bash standalone_tests/python_only_compile.sh
+- label: Basic Correctness Test # 20min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_cumem.py
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s basic_correctness/test_cumem.py
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
+- label: Entrypoints Unit Tests # 5min
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
+  timeout_in_minutes: 10
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  source_file_dependencies:
+  - vllm/entrypoints
+  - tests/entrypoints/
+  commands:
+  - pytest -v -s entrypoints/openai/tool_parsers
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+- label: Entrypoints Integration Test (LLM) # 30min
+  timeout_in_minutes: 40
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/llm
+  - tests/entrypoints/offline_mode
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+- label: Entrypoints Integration Test (API Server) # 100min
+  timeout_in_minutes: 130
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/test_chat_utils.py
+- label: Entrypoints Integration Test (Pooling)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/pooling
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/pooling
+- label: Distributed Tests (4 GPUs) # 35min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - tests/distributed/test_utils
+  - tests/distributed/test_pynccl
+  - tests/distributed/test_events
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - examples/offline_inference/rlhf.py
+  - examples/offline_inference/rlhf_colocate.py
+  - tests/examples/offline_inference/data_parallel.py
+  - tests/v1/distributed
+  - tests/v1/engine/test_engine_core_client.py
+  - tests/distributed/test_symm_mem_allreduce.py
+  commands:
+  # test with torchrun tp=2 and external_dp=2
+  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=2 and pp=2
+  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=4 and dp=1
+  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2, pp=2 and dp=1
+  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=1 and dp=4 with ep
+  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2 and dp=2 with ep
+  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with internal dp
+  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+  - pytest -v -s distributed/test_utils.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
+  - pytest -v -s distributed/test_symm_mem_allreduce.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - pushd ../examples/offline_inference
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+  - popd
+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  #- export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+- label: EPLB Algorithm Test # 5min
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
+  timeout_in_minutes: 15
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_algo.py
+  commands:
+  - pytest -v -s distributed/test_eplb_algo.py
+- label: EPLB Execution Test # 10min
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_execute.py
+  commands:
+  - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
+- label: Metrics, Tracing Test # 12min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_2
+  # grade: Blocking
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/
+  - tests/v1/tracing
+  commands:
+  - "pip install \
+      'opentelemetry-sdk>=1.26.0' \
+      'opentelemetry-api>=1.26.0' \
+      'opentelemetry-exporter-otlp>=1.26.0' \
+      'opentelemetry-semantic-conventions-ai>=0.4.1'"
+  - pytest -v -s v1/tracing
+##### fast check tests  #####
+#####  1 GPU test  #####
+- label: Regression Test # 7min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/test_regression
+  commands:
+  - pip install modelscope
+  - pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
+- label: Engine Test # 9min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/engine
+  - tests/test_sequence
+  - tests/test_config
+  - tests/test_logger
+  - tests/test_vllm_port
+  commands:
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+- label: V1 Test e2e + engine # 30min
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # TODO: accuracy does not match, whether setting
+    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+    - pytest -v -s v1/e2e
+    - pytest -v -s v1/engine
+- label: V1 Test entrypoints # 35min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    - pytest -v -s v1/entrypoints
+- label: V1 Test others # 42min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  commands:
+    # split the test to avoid interference
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - pytest -v -s -m 'not cpu_test' v1/core
+    - pytest -v -s v1/executor
+    - pytest -v -s v1/kv_offload
+    - pytest -v -s v1/sample
+    - pytest -v -s v1/logits_processors
+    - pytest -v -s v1/worker
+    - pytest -v -s v1/spec_decode
+    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'not cpu_test' v1/metrics
+    - pytest -v -s v1/test_oracle.py
+    - pytest -v -s v1/test_request.py
+    - pytest -v -s v1/test_outputs.py
+    # Integration test for streaming correctness (requires special branch).
+    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+# TODO: Add the "V1 Test attention (MI300)" test group
+- label: V1 Test attention (H100) # 10min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+- label: Batch Invariance Tests (H100) # 10min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+    - vllm/
+    - tests/v1/determinism/
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pip install pytest-timeout pytest-forked
+    - pytest -v -s v1/determinism/test_batch_invariance.py
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+- label: V1 Test others (CPU) # 5 mins
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  agent_pool: mi325_1
+  grade: Blocking
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  no_gpu: true
+  commands:
+    # split the test to avoid interference
+    - pytest -v -s -m 'cpu_test' v1/core
+    - pytest -v -s v1/structured_output
+    - pytest -v -s v1/test_serial_utils.py
+    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'cpu_test' v1/metrics
+- label: Examples Test # 30min
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/examples"
+  source_file_dependencies:
+  - vllm/entrypoints
+  - examples/
+  commands:
+    - pip install tensorizer # for tensorizer test
+    - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+    - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+    - python3 offline_inference/basic/chat.py
+    - python3 offline_inference/prefix_caching.py
+    - python3 offline_inference/llm_engine_example.py
+    - python3 offline_inference/audio_language.py --seed 0
+    - python3 offline_inference/vision_language.py --seed 0
+    - python3 offline_inference/vision_language_pooling.py --seed 0
+    - python3 offline_inference/vision_language_multi_image.py --seed 0
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+    - python3 offline_inference/basic/classify.py
+    - python3 offline_inference/basic/embed.py
+    - python3 offline_inference/basic/score.py
+    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+    #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+- label: Platform Tests (CUDA) # 4min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/cuda
+  commands:
+    - pytest -v -s cuda/test_cuda_context.py
+- label: Samplers Test # 56min
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/model_executor/layers
+  - vllm/sampling_metadata.py
+  - tests/samplers
+  - tests/conftest.py
+  commands:
+    - pytest -v -s samplers
+    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
+- label: LoRA Test %N # 20min each
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+    - pytest -v -s lora \
+      --shard-id=$$BUILDKITE_PARALLEL_JOB \
+      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+      --ignore=lora/test_chatglm3_tp.py \
+      --ignore=lora/test_llama_tp.py \
+      --ignore=lora/test_llm_with_multi_loras.py \
+      --ignore=lora/test_olmoe_tp.py \
+      --ignore=lora/test_deepseekv2_tp.py \
+      --ignore=lora/test_gptoss_tp.py \
+      --ignore=lora/test_qwen3moe_tp.py
+  parallelism: 4
+- label: PyTorch Compilation Unit Tests # 15min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+    - vllm/
+    - tests/compile
+  commands:
+  # Run unit tests defined directly under compile/,
+  # not including subdirectories, which are usually heavier
+  # tests covered elsewhere.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+- label: PyTorch Fullgraph Smoke Test # 15min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
+- label: PyTorch Fullgraph Test # 27min
+  timeout_in_minutes: 40
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+    # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
+- label: Kernels Core Operation Test # 48min
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
+  commands:
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+- label: Kernels Attention Test %N # 23min
+  timeout_in_minutes: 35
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_8
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/attention/
+  - vllm/attention
+  - vllm/v1/attention
+  - tests/kernels/attention
+  commands:
+    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+- label: Kernels Quantization Test %N # 64min
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/quantization/
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization
+  commands:
+    - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+- label: Kernels MoE Test %N # 40min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_8
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/quantization/cutlass_w8a8/moe/
+  - csrc/moe/
+  - tests/kernels/moe
+  - vllm/model_executor/layers/fused_moe/
+  - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
+  commands:
+    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+- label: Kernels Mamba Test # 31min
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/mamba/
+  - tests/kernels/mamba
+  - vllm/model_executor/layers/mamba/ops
+  commands:
+    - pytest -v -s kernels/mamba
+- label: Kernels DeepGEMM Test (H100) # Nvidia-centric
+# Not replicating for CUTLASS & CuTe
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+    - pytest -v -s kernels/moe/test_deepgemm.py
+    - pytest -v -s kernels/moe/test_batched_deepgemm.py
+    - pytest -v -s kernels/attention/test_deepgemm_attention.py
+- label: Model Executor Test # 23min
+  timeout_in_minutes: 35
+  torch_nightly: true
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
+  - vllm/model_executor
+  - tests/model_executor
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+  commands:
+    - apt-get update && apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s model_executor
+    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
+- label: Benchmarks # 11min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_8
+  # grade: Blocking
+  working_dir: "/vllm-workspace/.buildkite"
+  source_file_dependencies:
+  - benchmarks/
+  commands:
+  - bash scripts/run-benchmarks.sh
+- label: Benchmarks CLI Test # 7min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_8
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/benchmarks/
+  commands:
+  - pytest -v -s benchmarks/
+- label: Quantization Test # 70min
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  - tests/quantization
+  commands:
+  # temporary install here since we need nightly, will move to requirements/test.in
+  # after torchao 0.12 release, and pin a working version of torchao nightly here
+  # since torchao nightly is only compatible with torch nightly currently
+  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+  # we can only upgrade after this is resolved
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
+- label: LM Eval Small Models # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+- label: OpenAI API correctness # 10min
+  timeout_in_minutes: 15
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - csrc/
+  - vllm/entrypoints/openai/
+  - vllm/model_executor/models/whisper.py
+  commands: # LMEval
+  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
+  - pytest -s entrypoints/openai/correctness/
+- label: OpenAI-Compatible Tool Use # 23 min
+  timeout_in_minutes: 35
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  fast_check: false
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  commands:
+    - pytest -v -s -m 'not cpu_test' tool_use
+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  timeout_in_minutes: 10
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  no_gpu: true
+  commands:
+    - pytest -v -s -m 'cpu_test' tool_use
+#####  models test  #####
+- label: Basic Models Tests (Initialization)
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_initialization.py
+  commands:
+    # Run a subset of model initialization tests
+    - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+- label: Basic Models Tests (Extra Initialization) %N
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_8
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/model_executor/models/
+  - vllm/transformers_utils/
+  - tests/models/test_initialization.py
+  commands:
+    # Only when vLLM model source is modified - test initialization of a large
+    # subset of supported models (the complement of the small subset in the above
+    # test). Also run if the model initialization test file is modified.
+    - pytest -v -s models/test_initialization.py \
+             -k 'not test_can_initialize_small_subset' \
+             --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+             --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+- label: Basic Models Tests (Other)
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_transformers.py
+  - tests/models/test_registry.py
+  commands:
+    - pytest -v -s models/test_transformers.py models/test_registry.py
+- label: Basic Models Test (Other CPU) # 5min
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  timeout_in_minutes: 10
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_utils.py
+  - tests/models/test_vision.py
+  no_gpu: true
+  commands:
+    - pytest -v -s models/test_utils.py models/test_vision.py
+- label: Language Models Tests (Standard)
+  timeout_in_minutes: 25
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language
+  commands:
+    # Test standard language models, excluding a subset of slow tests
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/language -m 'core_model and (not slow_test)'
+- label: Language Models Tests (Extra Standard) %N
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/language/pooling/test_embedding.py
+  - tests/models/language/generation/test_common.py
+  - tests/models/language/pooling/test_classification.py
+  commands:
+    # Shard slow subset of standard language models tests. Only run when model
+    # source is modified, or when specified test files are modified
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/language -m 'core_model and slow_test' \
+             --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+             --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+- label: Language Models Tests (Hybrid) %N
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_8
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation
+  commands:
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+    # Shard hybrid language model tests
+    - pytest -v -s models/language/generation \
+                   -m hybrid_model \
+                   --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+                   --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+- label: Language Models Test (Extended Generation) # 80min
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation
+  commands:
+    # Install fast path packages for testing against transformers
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+- label: Language Models Test (PPL)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation_ppl_test
+  commands:
+    - pytest -v -s models/language/generation_ppl_test
+- label: Language Models Test (Extended Pooling)  # 36min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling
+  commands:
+    - pytest -v -s models/language/pooling -m 'not core_model'
+- label: Language Models Test (MTEB)
+  timeout_in_minutes: 110
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling_mteb_test
+  commands:
+    - pytest -v -s models/language/pooling_mteb_test
+- label: Multi-Modal Processor Test # 44min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing
+- label: Multi-Modal Models Test (Standard) # 60min
+  timeout_in_minutes: 80
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+- label: Multi-Modal Accuracy Eval (Small Models) # 10min
+  timeout_in_minutes: 70
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+- label: Multi-Modal Models Test (Extended) 1
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+- label: Multi-Modal Models Test (Extended) 2
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+- label: Multi-Modal Models Test (Extended) 3
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+- label: Quantized Models Test # 45 min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  source_file_dependencies:
+  - vllm/model_executor/layers/quantization
+  - tests/models/quantization
+  commands:
+    - pytest -v -s models/quantization
+# This test is used only in the PR development phase to test individual models and should never run on main.
+- label: Custom Models Test
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_1
+  # grade: Blocking
+  optional: true
+  commands:
+    - echo 'Testing custom models...'
+    # PR authors can temporarily add commands below to test individual models
+    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
+- label: Transformers Nightly Models Test
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/"
+  optional: true
+  commands:
+    - pip install --upgrade git+https://github.com/huggingface/transformers
+    - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
+    - pytest -v -s tests/models/test_transformers.py
+    # - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
+    - python3 examples/offline_inference/basic/chat.py
+    # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # Whisper needs spawn method to avoid deadlock
+    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  # optional: true
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - csrc/attention/mla/
+  - csrc/quantization/cutlass_w8a8/moe/
+  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
+  commands:
+    - nvidia-smi
+    - python3 examples/offline_inference/basic/chat.py
+    # Attention
+    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+    - pytest -v -s tests/kernels/attention/test_attention_selector.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+    - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+    # Quantization
+    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+    - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
+- label: Blackwell Fusion and Compile Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - vllm/model_executor/layers/fused_moe/layer.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+    - nvidia-smi
+    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap with quotes to escape yaml
+    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+    - nvidia-smi
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+- label: ROCm GPT-OSS Eval
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  agent_pool: mi325_1
+  mirror_hardwares: [amdexperimental, amdproduction]
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+- label: Blackwell Quantized MoE Test
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - tests/quantization/test_blackwell_moe.py
+  - vllm/model_executor/models/deepseek_v2.py
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/models/llama4.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization/compressed_tensors
+  - vllm/model_executor/layers/quantization/modelopt.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - pytest -s -v tests/quantization/test_blackwell_moe.py
+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+#####  1 GPU test  #####
+#####  multi gpus test  #####
+- label: Distributed Comm Ops Test # 7min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/distributed
+  - tests/distributed
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
+  - pytest -v -s distributed/test_shm_buffer.py
+  - pytest -v -s distributed/test_shm_storage.py
+- label: 2 Node Tests (4 GPUs in total) # 16min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  - tests/examples/offline_inference/data_parallel.py
+  commands:
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
+    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+- label: Distributed Tests (2 GPUs) # 68min
+  timeout_in_minutes: 90
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/worker/worker_base.py
+  - vllm/v1/engine/
+  - vllm/v1/worker/
+  - tests/compile/fullgraph/test_basic_correctness.py
+  - tests/compile/test_wrapper.py
+  - tests/distributed/
+  - tests/entrypoints/llm/test_collective_rpc.py
+  - tests/v1/distributed
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/v1/shutdown
+  - tests/v1/worker/test_worker_memory_snapshot.py
+  commands:
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+  - pytest -v -s ./compile/test_wrapper.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - pytest -v -s distributed/test_sequence_parallel.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+- label: Distributed Model Tests (2 GPUs) # 37min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/model_executor/model_loader/sharded_state_loader.py
+  - vllm/model_executor/models/
+  - tests/basic_correctness/
+  - tests/model_executor/model_loader/test_sharded_state_loader.py
+  - tests/models/
+  commands:
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
+- label: Plugin Tests (2 GPUs) # 40min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+  - pip install -e ./plugins/prithvi_io_processor_plugin
+  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  - pip uninstall prithvi_io_processor_plugin -y
+  # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
+  # other tests continue here:
+  - pytest -v -s plugins_tests/test_scheduler_plugins.py
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
+- label: Pipeline + Context Parallelism Test # 45min
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_pp_cudagraph.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
+- label: LoRA TP Test (Distributed) # 17 min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+    # FIXME: find out which code initializes CUDA before running the test.
+    # Until that is fixed, we need to use spawn to test it.
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # There is some Tensor Parallelism related processing logic in LoRA that
+    # requires multi-GPU testing for validation.
+    - pytest -v -s -x lora/test_chatglm3_tp.py
+    - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_llm_with_multi_loras.py
+    - pytest -v -s -x lora/test_olmoe_tp.py
+    # Disabled for now because the MXFP4 backend on non-CUDA platforms
+    # doesn't support LoRA yet
+    #- pytest -v -s -x lora/test_gptoss_tp.py
+- label: Weight Loading Multiple GPU Test  # 33min
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+##### multi gpus test #####
+##### A100 test #####
+- label: Distributed Tests (A100) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/
+  commands:
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - pytest -v -s -x lora/test_mixtral.py
+- label: LM Eval Large Models # optional
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  gpu: h200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - pytest -v -s tests/compile/distributed/test_async_tp.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    - pytest -v -s tests/distributed/test_sequence_parallel.py
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+  gpu: b200
+  optional: true
+  working_dir: "/vllm-workspace/"
+  num_gpus: 2
+  commands:
+    - pytest -v -s tests/distributed/test_context_parallel.py
+    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py
+##### RL Integration Tests #####
+- label: Prime-RL Integration Test # 15min
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_2
+  # grade: Blocking
+  timeout_in_minutes: 30
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  source_file_dependencies:
+  - vllm/
+  - .buildkite/scripts/run-prime-rl-test.sh
+  commands:
+    - bash .buildkite/scripts/run-prime-rl-test.sh
+- label: DeepSeek V2-Lite Accuracy
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+  mirror_hardwares: [amdexperimental, amdproduction]
+  agent_pool: mi325_4
+  # grade: Blocking
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -25,6 +25,7 @@
 #     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
 # working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
 # source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+# autorun_on_main (bool): defaults to false; if true, the test runs automatically when a commit is pushed to the main branch.
 # When adding a test
 # - If the test belongs to an existing group, add it there
@@ -38,7 +39,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/generate_nightly_torch_test.py
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
@@ -50,19 +51,32 @@ steps:
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
+  timeout_in_minutes: 20
+  source_file_dependencies:
+  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
-  - tests/utils_
  - tests/standalone_tests/lazy_imports.py
+  - tests/tokenizers_
  - tests/transformers_utils
+  - tests/config
+  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
-  - pytest -v -s multimodal
+  - pytest -v -s -m 'cpu_test' multimodal
-  - pytest -v -s utils_ # Utils
+  - pytest -v -s tokenizers_
-  - pytest -v -s transformers_utils # transformers_utils
+  - pytest -v -s transformers_utils
+  - pytest -v -s config
 - label: Python-only Installation Test # 10min
  timeout_in_minutes: 20
@@ -155,17 +169,16 @@ steps:
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
-  - tests/compile/test_basic_correctness
+  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/test_async_llm_dp.py
+  - tests/v1/distributed
-  - tests/v1/test_external_lb_dp.py
-  - tests/v1/test_internal_lb_dp.py
-  - tests/v1/test_hybrid_lb_dp.py
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
@@ -180,13 +193,14 @@ steps:
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
-  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -197,6 +211,24 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 - label: EPLB Algorithm Test # 5min
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
@@ -206,8 +238,8 @@ steps:
  commands:
  - pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution Test # 5min
+- label: EPLB Execution Test # 10min
-  timeout_in_minutes: 15
+  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@@ -215,6 +247,7 @@ steps:
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py
 - label: Metrics, Tracing Test # 12min
  timeout_in_minutes: 20
@@ -245,21 +278,18 @@ steps:
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 25min
+- label: Engine Test # 9min
-  timeout_in_minutes: 40
+  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/engine
-  - tests/tokenization
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
-  # OOM in the CI unless we run this separately
-  - pytest -v -s tokenization
 - label: V1 Test e2e + engine # 30min
  timeout_in_minutes: 45
@@ -289,27 +319,68 @@ steps:
    - vllm/
    - tests/v1
  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
    # split the test to avoid interference
-    - pytest -v -s v1/core
+    - pytest -v -s -m 'not cpu_test' v1/core
    - pytest -v -s v1/executor
    - pytest -v -s v1/kv_offload
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
-    - pytest -v -s v1/structured_output
    - pytest -v -s v1/spec_decode
-    - pytest -v -s v1/kv_connector/unit
+    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
-    - pytest -v -s v1/metrics
+    - pytest -v -s -m 'not cpu_test' v1/metrics
-    - pytest -v -s v1/test_kv_sharing.py
-    - pytest -v -s v1/test_metrics_reader.py
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_request.py
-    - pytest -v -s v1/test_serial_utils.py
+    - pytest -v -s v1/test_outputs.py
-    - pytest -v -s v1/test_utils.py
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+- label: V1 Test attention (H100) # 10min
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - pytest -v -s v1/attention
+- label: Batch Invariance Tests (H100) # 10min
+  timeout_in_minutes: 25
+  gpu: h100
+  source_file_dependencies:
+    - vllm/
+    - tests/v1/determinism/
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pip install pytest-timeout pytest-forked
+    - pytest -v -s v1/determinism/test_batch_invariance.py
+    - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+    - vllm/v1/attention
+    - tests/v1/attention
+  commands:
+    - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FlashInfer prefill is buggy and produces incorrect results; fix it and remove this override
+- label: V1 Test others (CPU) # 5 mins
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  no_gpu: true
+  commands:
+    # split the test to avoid interference
+    - pytest -v -s -m 'cpu_test' v1/core
+    - pytest -v -s v1/structured_output
+    - pytest -v -s v1/test_serial_utils.py
+    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'cpu_test' v1/metrics
 - label: Examples Test # 30min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
@@ -334,7 +405,8 @@ steps:
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 - label: Platform Tests (CUDA) # 4min
  timeout_in_minutes: 15
@@ -369,7 +441,12 @@ steps:
      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
      --ignore=lora/test_chatglm3_tp.py \
      --ignore=lora/test_llama_tp.py \
-      --ignore=lora/test_llm_with_multi_loras.py
+      --ignore=lora/test_llm_with_multi_loras.py \
+      --ignore=lora/test_olmoe_tp.py \
+      --ignore=lora/test_deepseekv2_tp.py \
+      --ignore=lora/test_gptoss_tp.py \
+      --ignore=lora/test_qwen3moe_tp.py
  parallelism: 4
 - label: PyTorch Compilation Unit Tests # 15min
@@ -380,15 +457,12 @@ steps:
    - vllm/
    - tests/compile
  commands:
-    - pytest -v -s compile/test_pass_manager.py
+  # Run unit tests defined directly under compile/,
-    - pytest -v -s compile/test_fusion.py
+  # not including subdirectories, which are usually heavier
-    - pytest -v -s compile/test_fusion_attn.py
+  # tests covered elsewhere.
-    - pytest -v -s compile/test_silu_mul_quant_fusion.py
+  # Use `find` to launch multiple instances of pytest so that
-    - pytest -v -s compile/test_sequence_parallelism.py
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
-    - pytest -v -s compile/test_async_tp.py
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
-    - pytest -v -s compile/test_fusion_all_reduce.py
-    - pytest -v -s compile/test_decorator.py
-    - pytest -v -s compile/test_noop_elimination.py
 - label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
@@ -398,18 +472,37 @@ steps:
  - vllm/
  - tests/compile
  commands:
-  - pytest -v -s compile/test_basic_correctness.py
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
-  - pytest -v -s compile/piecewise/
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
- label: PyTorch Fullgraph Test # 20min
+- label: PyTorch Fullgraph Test # 27min
-  timeout_in_minutes: 30
+  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
-  - pytest -v -s compile/test_full_graph.py
+    # fp8 kv scales not supported on sm89, tested on Blackwell instead
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+    # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+- label: Cudagraph test
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - tests/v1/cudagraph
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/config/compilation.py
+  - vllm/compilation
+  commands:
+    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
 - label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
@@ -417,8 +510,9 @@ steps:
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
  commands:
-    - pytest -v -s kernels/core
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
 - label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
@@ -452,6 +546,8 @@ steps:
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
+  - vllm/envs.py
+  - vllm/config
  commands:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
@@ -462,32 +558,44 @@ steps:
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
+  - vllm/model_executor/layers/mamba/ops
  commands:
    - pytest -v -s kernels/mamba
- label: Tensorizer Test # 14min
+- label: Kernels DeepGEMM Test (H100)
-  timeout_in_minutes: 25
+  timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental]
+  gpu: h100
+  num_gpus: 1
  source_file_dependencies:
-  - vllm/model_executor/model_loader
+  - tools/install_deepgemm.sh
-  - tests/tensorizer_loader
+  - vllm/utils/deep_gemm.py
-  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
  commands:
-    - apt-get update && apt-get install -y curl libsodium23
+    - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s kernels/moe/test_deepgemm.py
-    - pytest -v -s tensorizer_loader
+    - pytest -v -s kernels/moe/test_batched_deepgemm.py
-    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
+    - pytest -v -s kernels/attention/test_deepgemm_attention.py
- label: Model Executor Test # 7min
+- label: Model Executor Test # 23min
-  timeout_in_minutes: 20
+  timeout_in_minutes: 35
+  torch_nightly: true
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
+  - vllm/engine/arg_utils.py
+  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s model_executor
+    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 - label: Benchmarks # 11min
  timeout_in_minutes: 20
@@ -521,8 +629,10 @@ steps:
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
+  # TODO(jerryzh168): resolve the above comment
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - uv pip install --system conch-triton-kernels
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
@@ -530,6 +640,7 @@ steps:
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
+  autorun_on_main: true
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
@@ -550,10 +661,17 @@ steps:
  source_file_dependencies:
    - vllm/
    - tests/tool_use
-    - tests/mistral_tool_use
  commands:
-    - pytest -v -s tool_use
+    - pytest -v -s -m 'not cpu_test' tool_use
-    - pytest -v -s mistral_tool_use
+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+  timeout_in_minutes: 10
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  no_gpu: true
+  commands:
+    - pytest -v -s -m 'cpu_test' tool_use
 #####  models test  #####
@@ -574,6 +692,7 @@ steps:
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
+  - vllm/transformers_utils/
  - tests/models/test_initialization.py
  commands:
    # Only when vLLM model source is modified - test initialization of a large
@@ -593,13 +712,19 @@ steps:
  - vllm/
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
+  commands:
+    - pytest -v -s models/test_transformers.py models/test_registry.py
+- label: Basic Models Test (Other CPU) # 5min
+  timeout_in_minutes: 10
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
+  no_gpu: true
  commands:
-    - pytest -v -s models/test_transformers.py \
+    - pytest -v -s models/test_utils.py models/test_vision.py
-                   models/test_registry.py \
-                   models/test_utils.py \
-                   models/test_vision.py
 - label: Language Models Tests (Standard)
  timeout_in_minutes: 25
@@ -658,8 +783,10 @@ steps:
  - vllm/
  - tests/models/language/generation
  commands:
-    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+    # Install fast path packages for testing against transformers
-    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+    # Note: also needed to run plamo2 model in vLLM
+    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 - label: Language Models Test (PPL)
@@ -692,14 +819,24 @@ steps:
  commands:
    - pytest -v -s models/language/pooling_mteb_test
- label: Multi-Modal Processor Test # 44min
+- label: Multi-Modal Processor Test (CPU)
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
+  no_gpu: true
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal/processing
+    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+- label: Multi-Modal Processor Test
+  timeout_in_minutes: 60
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal/processing/test_tensor_schema.py
 - label: Multi-Modal Models Test (Standard) # 60min
  timeout_in_minutes: 80
@@ -714,6 +851,16 @@ steps:
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
 - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true
@@ -766,9 +913,11 @@ steps:
 - label: Transformers Nightly Models Test
  working_dir: "/vllm-workspace/"
  optional: true
+  soft_fail: true
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_transformers.py
    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
    - python3 examples/offline_inference/basic/chat.py
@@ -776,8 +925,8 @@ steps:
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 38 min
+- label: Blackwell Test # 21 min
-  timeout_in_minutes: 60
+  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
@@ -790,13 +939,16 @@ steps:
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
-  - vllm/compilation/fusion_attn.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
  commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
    # Attention
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+    - pytest -v -s tests/kernels/attention/test_attention_selector.py
    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -808,19 +960,71 @@ steps:
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-    - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
+    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-    # Fusion
-    - pytest -v -s tests/compile/test_fusion_all_reduce.py
-    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
+    - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
+- label: Blackwell Fusion and Compile Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+    - nvidia-smi
+    - pytest -v -s tests/compile/test_fusion_attn.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap with quotes to escape yaml
+    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  commands:
+    - nvidia-smi
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: GPT-OSS Eval (Blackwell)
+- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
-  optional: true # disable while debugging
+  optional: true # run on nightlies
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
@@ -828,7 +1032,34 @@ steps:
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+- label: Blackwell Quantized MoE Test
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - tests/quantization/test_blackwell_moe.py
+  - vllm/model_executor/models/deepseek_v2.py
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/models/llama4.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization/compressed_tensors
+  - vllm/model_executor/layers/quantization/modelopt.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - pytest -s -v tests/quantization/test_blackwell_moe.py
+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
 #####  1 GPU test  #####
 #####  multi gpus test  #####
@@ -885,23 +1116,26 @@ steps:
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
-  - tests/compile/test_basic_correctness.py
+  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
-  - tests/v1/test_async_llm_dp.py
+  - tests/v1/distributed
-  - tests/v1/test_external_lb_dp.py
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  # https://github.com/NVIDIA/nccl/issues/1838
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s ./compile/test_basic_correctness.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
@@ -945,6 +1179,11 @@ steps:
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
+  # begin stat_logger plugins test
+  - pip install -e ./plugins/vllm_add_dummy_stat_logger
+  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+  - pip uninstall dummy_stat_logger -y
+  # end stat_logger plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
@@ -984,6 +1223,8 @@ steps:
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
+    - pytest -v -s -x lora/test_olmoe_tp.py
+    - pytest -v -s -x lora/test_gptoss_tp.py
 - label: Weight Loading Multiple GPU Test  # 33min
@@ -1010,6 +1251,17 @@ steps:
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
 ##### multi gpus test #####
 ##### A100 test #####
@@ -1040,15 +1292,34 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
+    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
 ##### H200 test #####
- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - pytest -v -s tests/v1/distributed/test_dbo.py
 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
@@ -1059,6 +1330,7 @@ steps:
  commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+    - pytest -v -s tests/v1/distributed/test_dbo.py
 ##### RL Integration Tests #####
 - label: Prime-RL Integration Test # 15min
@@ -1071,3 +1343,30 @@ steps:
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
    - bash .buildkite/scripts/run-prime-rl-test.sh
+- label: DeepSeek V2-Lite Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
--- a/.coveragerc
+++ b/.coveragerc
 [run]
-source = vllm
+# Track the installed vllm package (this is what actually gets imported during tests)
+# Use wildcard pattern to match the installed location
+source =
+    vllm
+    */dist-packages/vllm
+    */site-packages/vllm
 omit =
    */tests/*
    */test_*
@@ -12,6 +17,16 @@ omit =
    */benchmarks/*
    */docs/*
+[paths]
+# Map all possible vllm locations to a canonical "vllm" path
+# This ensures coverage.combine properly merges data from different test runs
+source =
+    vllm
+    /vllm-workspace/src/vllm
+    /vllm-workspace/vllm
+    */site-packages/vllm
+    */dist-packages/vllm
 [report]
 exclude_lines =
    pragma: no cover

--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
+# Migrate from `yapf` & `isort` to `ruff`
+d6953beb91da4e9c99be4c0a1304a2d24189535c
+# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
+8fcaaf6a165e661f63fc51be906bc05b0767332f
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,17 +3,14 @@
 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention @LucasWilkinson
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
-/vllm/model_executor/layers/fused_moe @mgoin
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
-/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/model_executor/layers/batch_invariant.py @yewentao256 
-/vllm/v1/attention @LucasWilkinson
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
-/vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
@@ -24,44 +21,61 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
-/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
+/vllm/v1/attention @LucasWilkinson
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/attention/backends/mla @pavanimajety
-/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
-/vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/sample @22quinn @houseroad @njhill
+/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
 /vllm/v1/offloading @ApostaC
+# Model runner V2
+/vllm/v1/worker/gpu @WoosukKwon
 # Test ownership
-/.buildkite/lm-eval-harness @mgoin @simon-mo
+/.buildkite/lm-eval-harness @mgoin 
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
 /tests/evals @mgoin
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
-/tests/v1/kv_connector/nixl_integration @NickLucche 
+/tests/v1/kv_connector/nixl_integration @NickLucche
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC
+/tests/v1/determinism @yewentao256 
-# Transformers backend
+# Transformers modeling backend
-/vllm/model_executor/models/transformers.py @hmellor
+/vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor
+# Observability
+/vllm/config/observability.py @markmc
+/vllm/v1/metrics @markmc
+/tests/v1/metrics @markmc
+/vllm/tracing.py @markmc
+/tests/v1/tracing/test_tracing.py @markmc
+/vllm/config/kv_events.py @markmc
+/vllm/distributed/kv_events.py @markmc
+/tests/distributed/test_events.py @markmc
 # Docs
 /docs/mkdocs @hmellor
 /docs/**/*.yml @hmellor
@@ -106,11 +120,21 @@ mkdocs.yaml @hmellor
 /vllm/attention/ops/triton_unified_attention.py @tdoublep
 # ROCm related: specify owner with write access to notify AMD folks for careful code review
-/docker/Dockerfile.rocm* @gshtras
+/vllm/**/*rocm* @tjtanaa
-/vllm/v1/attention/backends/rocm*.py @gshtras
+/docker/Dockerfile.rocm* @gshtras @tjtanaa
-/vllm/v1/attention/backends/mla/rocm*.py @gshtras
+/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
-/vllm/attention/ops/rocm*.py @gshtras
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
-/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
+/csrc/rocm @gshtras @tjtanaa
+/requirements/*rocm* @tjtanaa
+/tests/**/*rocm* @tjtanaa
+/docs/**/*rocm* @tjtanaa
+/vllm/**/*quark* @tjtanaa
+/tests/**/*quark* @tjtanaa
+/docs/**/*quark* @tjtanaa
+/vllm/**/*aiter* @tjtanaa
+/tests/**/*aiter* @tjtanaa
 # TPU
 /vllm/v1/worker/tpu* @NickLucche
@@ -120,3 +144,17 @@ mkdocs.yaml @hmellor
 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
+# Pooling models
+/examples/*/pooling/ @noooop
+/tests/models/*/pooling* @noooop
+/tests/entrypoints/pooling @noooop
+/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop
+/vllm/config/pooler.py @noooop
+/vllm/pooling_params.py @noooop
+/vllm/model_executor/layers/pooler.py @noooop
+# Security guide and policies
+/docs/usage/security.md @russellb
+/SECURITY.md @russellb
+/docs/contributing/vulnerability_management.md @russellb
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -2,6 +2,7 @@ pull_request_rules:
 - name: label-documentation
  description: Automatically apply documentation label
  conditions:
+    - label != stale
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
@@ -10,10 +11,13 @@ pull_request_rules:
    label:
      add:
        - documentation
+    comment:
+      message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
 - name: label-ci-build
  description: Automatically apply ci/build label
  conditions:
+    - label != stale
    - or:
      - files~=^\.github/
      - files~=\.buildkite/
@@ -30,6 +34,7 @@ pull_request_rules:
 - name: label-deepseek
  description: Automatically apply deepseek label
  conditions:
+    - label != stale
    - or:
      - files~=^examples/.*deepseek.*\.py
      - files~=^tests/.*deepseek.*\.py
@@ -46,6 +51,7 @@ pull_request_rules:
 - name: label-frontend
  description: Automatically apply frontend label
  conditions:
+    - label != stale
    - files~=^vllm/entrypoints/
  actions:
    label:
@@ -55,6 +61,7 @@ pull_request_rules:
 - name: label-llama
  description: Automatically apply llama label
  conditions:
+    - label != stale
    - or:
      - files~=^examples/.*llama.*\.py
      - files~=^tests/.*llama.*\.py
@@ -70,6 +77,7 @@ pull_request_rules:
 - name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
+    - label != stale
    - or:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
@@ -83,6 +91,7 @@ pull_request_rules:
 - name: label-new-model
  description: Automatically apply new-model label
  conditions:
+    - label != stale
    - and:
      - files~=^vllm/model_executor/models/
      - files=vllm/model_executor/models/registry.py
@@ -94,11 +103,12 @@ pull_request_rules:
 - name: label-performance
  description: Automatically apply performance label
  conditions:
+    - label != stale
    - or:
      - files~=^benchmarks/
      - files~=^vllm/benchmarks/
      - files~=^tests/benchmarks/
-      - files~=^\.buildkite/nightly-benchmarks/
+      - files~=^\.buildkite/performance-benchmarks/
  actions:
    label:
      add:
@@ -107,6 +117,7 @@ pull_request_rules:
 - name: label-qwen
  description: Automatically apply qwen label
  conditions:
+    - label != stale
    - or:
      - files~=^examples/.*qwen.*\.py
      - files~=^tests/.*qwen.*\.py
@@ -121,6 +132,7 @@ pull_request_rules:
 - name: label-gpt-oss
  description: Automatically apply gpt-oss label
  conditions:
+    - label != stale
    - or:
      - files~=^examples/.*gpt[-_]?oss.*\.py
      - files~=^tests/.*gpt[-_]?oss.*\.py
@@ -139,9 +151,27 @@ pull_request_rules:
      add:
        - gpt-oss
+- name: label-nvidia
+  description: Automatically apply nvidia label
+  conditions:
+    - label != stale
+    - or:
+      - files~=cuda
+      - files~=cutlass
+      - files~=flashinfer
+      - files~=trtllm
+      - title~=(?i)NVIDIA
+      - title~=(?i)CUDA
+      - title~=(?i)CUTLASS
+  actions:
+    label:
+      add:
+        - nvidia
 - name: label-rocm
  description: Automatically apply rocm label
  conditions:
+    - label != stale
    - or:
      - files~=^csrc/rocm/
      - files~=^docker/Dockerfile.rocm
@@ -162,6 +192,7 @@ pull_request_rules:
 - name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
+    - label != stale
    - or:
      - files~=^benchmarks/structured_schemas/
      - files=benchmarks/benchmark_serving_structured_output.py
@@ -181,6 +212,7 @@ pull_request_rules:
 - name: label-speculative-decoding
  description: Automatically apply speculative-decoding label
  conditions:
+    - label != stale
    - or:
      - files~=^vllm/v1/spec_decode/
      - files~=^tests/v1/spec_decode/
@@ -196,6 +228,7 @@ pull_request_rules:
 - name: label-v1
  description: Automatically apply v1 label
  conditions:
+    - label != stale
    - or:
      - files~=^vllm/v1/
      - files~=^tests/v1/
@@ -208,6 +241,7 @@ pull_request_rules:
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
+    - label != stale
    - or:
      - files~=tpu.py
      - files~=_tpu
@@ -223,6 +257,7 @@ pull_request_rules:
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
  conditions:
+    - label != stale
    - and:
      - -files~=tpu.py
      - -files~=_tpu
@@ -237,9 +272,9 @@ pull_request_rules:
 - name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
+    - label != stale
    - or:
      - files~=^tests/tool_use/
-      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
@@ -256,8 +291,9 @@ pull_request_rules:
 - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
-      - conflict
+    - label != stale
-      - -closed
+    - conflict
+    - -closed
  actions:
    label:
      add:
@@ -271,10 +307,12 @@ pull_request_rules:
 - name: assign reviewer for tensorizer changes
  conditions:
+    - label != stale
+    - or:
      - files~=^vllm/model_executor/model_loader/tensorizer.py
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
-      - files~=^tests/tensorizer_loader/
+      - files~=^tests/model_executor/model_loader/tensorizer_loader/
  actions:
    assign:
      users:
@@ -282,6 +320,7 @@ pull_request_rules:
 - name: assign reviewer for modelopt changes
  conditions:
+    - label != stale
    - or:
        - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
        - files~=^vllm/model_executor/layers/quantization/__init__\.py$
@@ -296,8 +335,8 @@ pull_request_rules:
 - name: remove 'needs-rebase' label when conflict is resolved
  conditions:
-      - -conflict
+    - -conflict
-      - -closed
+    - -closed
  actions:
    label:
      remove:
@@ -306,6 +345,7 @@ pull_request_rules:
 - name: label-kv-connector
  description: Automatically apply kv-connector label
  conditions:
+    - label != stale
    - or:
      - files~=^examples/online_serving/disaggregated[^/]*/.*
      - files~=^examples/offline_inference/disaggregated[^/]*/.*

--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -13,7 +13,7 @@ jobs:
    steps:
      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
      - name: Set up Python
        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0

--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -13,6 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Label issues based on keywords
+        id: label-step
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
        with:
          script: |
@@ -42,7 +43,6 @@ jobs:
                    searchIn: "body"
                  },
                ],
                // Substring search - matches anywhere in text (partial matches)
                substrings: [
                  {
@@ -89,14 +89,12 @@ jobs:
                    term: "hip_",
                    searchIn: "both"
                  },
                  // ROCm tools and libraries
                  {
                    term: "hipify",
                    searchIn: "both"
                  },
                ],
                // Regex patterns - for complex pattern matching
                regexPatterns: [
                  {
@@ -107,13 +105,42 @@ jobs:
                  }
                ],
              },
+              cpu: {
+                // Keyword search - matches whole words only (with word boundaries)
+                keywords: [
+                  {
+                    term: "CPU Backend",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "x86",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "ARM",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "Apple Silicon",
+                    searchIn: "title"
+                  },
+                  {
+                    term: "IBM Z",
+                    searchIn: "title"
+                  },
+                ],
+              },
+              // Add more label configurations here as needed
+              // example: {
+              //   keywords: [...],
+              //   substrings: [...],
+              //   regexPatterns: [...]
+              // },
            };
            // Helper function to create regex based on search type
            function createSearchRegex(term, type) {
              // Escape special regex characters in the term
              const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
              switch (type) {
                case 'keyword':
                  // Word boundary search - matches whole words only
@@ -125,16 +152,13 @@ jobs:
                  throw new Error(`Unknown search type: ${type}`);
              }
            }
            // Helper function to find matching terms in text with line information
            function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
              const matches = [];
              const lines = text.split('\n');
              for (const termConfig of searchTerms) {
                let regex;
                let term, searchIn, pattern, description, flags;
                // Handle different input formats (string or object)
                if (typeof termConfig === 'string') {
                  term = termConfig;
@@ -146,21 +170,17 @@ jobs:
                  description = termConfig.description;
                  flags = termConfig.flags;
                }
                // Skip if this term shouldn't be searched in the current location
                if (searchIn !== 'both' && searchIn !== searchLocation) {
                  continue;
                }
                // Create appropriate regex
                if (searchType === 'regex') {
                  regex = new RegExp(pattern, flags || "gi");
                } else {
                  regex = createSearchRegex(term, searchType);
                }
                const termMatches = [];
                // Check each line for matches
                lines.forEach((line, lineIndex) => {
                  const lineMatches = line.match(regex);
@@ -175,15 +195,14 @@ jobs:
                        originalTerm: term || pattern,
                        description: description,
                        // Show context around the match in the line
-                        context: line.length > 100 ? 
+                        context: line.length > 100 ?
-                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), 
+                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
-                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
+                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
                          : line.trim()
                      });
                    });
                  }
                });
                if (termMatches.length > 0) {
                  matches.push({
                    term: term || (description || pattern),
@@ -196,64 +215,48 @@ jobs:
                  });
                }
              }
              return matches;
            }
            // Helper function to check if label should be added
            async function processLabel(labelName, config) {
              const body = context.payload.issue.body || "";
              const title = context.payload.issue.title || "";
              core.notice(`Processing label: ${labelName}`);
              core.notice(`Issue Title: "${title}"`);
              core.notice(`Issue Body length: ${body.length} characters`);
              let shouldAddLabel = false;
              let allMatches = [];
              let reason = '';
              const keywords = config.keywords || [];
              const substrings = config.substrings || [];
              const regexPatterns = config.regexPatterns || [];
              core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
              // Search in title
              if (title.trim()) {
                core.notice(`Searching in title: "${title}"`);
                const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
                const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
                const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
                allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
              }
              // Search in body
              if (body.trim()) {
                core.notice(`Searching in body (${body.length} characters)`);
                const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
                const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
                const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
                allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
              }
              if (allMatches.length > 0) {
                core.notice(`Found ${allMatches.length} matching term(s):`);
                for (const termMatch of allMatches) {
                  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                  const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
                  if (termMatch.searchType === 'regex') {
                    core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  } else {
                    core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  }
                  // Show details for each match
                  termMatch.matches.forEach((match, index) => {
                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@@ -266,7 +269,6 @@ jobs:
                    }
                  });
                }
                shouldAddLabel = true;
                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@@ -274,13 +276,10 @@ jobs:
                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
              }
              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
              core.notice(`Reason: ${reason || 'No matching terms found'}`);
              if (shouldAddLabel) {
                const existingLabels = context.payload.issue.labels.map(l => l.name);
                if (!existingLabels.includes(labelName)) {
@@ -296,14 +295,92 @@ jobs:
                core.notice(`Label "${labelName}" already present.`);
                return false;
              }
              core.notice(`No matching terms found for label "${labelName}".`);
              return false;
            }
            // Process all configured labels
-            const processLabels = Object.entries(labelConfig)
+            const labelsAddedResults = await Promise.all(
-              .map(([labelName, config]) => processLabel(labelName, config));
+              Object.entries(labelConfig).map(([labelName, config]) => 
-            const labelsAdded = await Promise.all(processLabels);
+                processLabel(labelName, config).then(added => ({ labelName, added }))
-            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
+              )
-            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+            );
\ No newline at end of file
+            const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
+            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+            // Return which labels were added for the next step
+            const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
+            core.setOutput('labels_added', JSON.stringify(addedLabels));
+            return addedLabels;
+      # For every label the previous step just added, post a comment that
+      # @-mentions the users configured for that label, skipping users who are
+      # already mentioned in the issue body or in an existing comment.
+      - name: CC users for labeled issues
+        if: steps.label-step.outputs.labels_added != '[]'
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
+        env:
+          # Hand the step output to the script through the environment rather
+          # than templating it into the JavaScript source, so its contents can
+          # never be interpreted as code or terminate the string literal.
+          LABELS_ADDED: ${{ steps.label-step.outputs.labels_added }}
+        with:
+          script: |
+            // Configuration: Map labels to GitHub users to CC
+            // You can add multiple users per label, and multiple label configurations
+            const ccConfig = {
+              rocm: {
+                users: ['hongxiayang', 'tjtanaa', 'vllmellm'],  // Add more users as needed: ['user1', 'user2', 'user3']
+                message: 'CC {users} for ROCm-related issue'  // {users} will be replaced with @mentions
+              },
+              // Add more label -> user mappings here
+              // Example:
+              // cuda: {
+              //   users: ['user1', 'user2'],
+              //   message: 'CC {users} for CUDA-related issue'
+              // },
+              // performance: {
+              //   users: ['perfexpert'],
+              //   message: 'CC {users} for performance issue'
+              // },
+            };
+            // An unset/empty output is treated as "no labels added" instead of
+            // making JSON.parse throw on an empty string.
+            const labelsAdded = JSON.parse(process.env.LABELS_ADDED || '[]');
+            core.notice(`Labels added: ${labelsAdded.join(', ')}`);
+            // Get existing comments to check for already mentioned users.
+            // Paginate so mentions beyond the first page of comments are seen too.
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              per_page: 100,
+            });
+            const issueBody = context.payload.issue.body || '';
+            const allExistingText = issueBody + '\n' + comments.map(c => c.body || '').join('\n');
+            // Process each label that was added
+            for (const label of labelsAdded) {
+              if (ccConfig[label]) {
+                const config = ccConfig[label];
+                const usersToMention = [];
+                // Check which users haven't been mentioned yet
+                for (const user of config.users) {
+                  const mentionPattern = new RegExp(`@${user}\\b`, 'i');
+                  if (!mentionPattern.test(allExistingText)) {
+                    usersToMention.push(user);
+                  } else {
+                    core.notice(`@${user} already mentioned for label "${label}", skipping`);
+                  }
+                }
+                // Post comment if there are users to mention
+                if (usersToMention.length > 0) {
+                  const mentions = usersToMention.map(u => `@${u}`).join(' ');
+                  const message = config.message.replace('{users}', mentions);
+                  await github.rest.issues.createComment({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    issue_number: context.issue.number,
+                    body: message
+                  });
+                  core.notice(`CC comment added for label "${label}": ${mentions}`);
+                } else {
+                  core.notice(`All users for label "${label}" already mentioned, skipping comment`);
+                }
+              }
+            }
\ No newline at end of file
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
+# Builds vLLM from source on a GitHub-hosted macOS runner and verifies the
+# package imports. Runs on pushes to main and on manual dispatch.
+name: macOS Apple Silicon Smoke Test
+on:
+  push:
+    branches:
+      - main
+  workflow_dispatch:  # Manual trigger
+jobs:
+  macos-m1-smoke-test:
+    runs-on: macos-latest
+    timeout-minutes: 30
+    steps:
+      # Pinned to the same commit SHA the other workflows in this repo use,
+      # instead of the mutable `v6` tag.
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+      # NOTE(review): still referenced by a mutable tag; pin to a vetted commit SHA.
+      - uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+          cache-dependency-glob: |
+            requirements/**/*.txt
+            pyproject.toml
+          python-version: '3.12'
+      - name: Create virtual environment
+        run: |
+          uv venv
+          # Expose the venv's executables (python, vllm) to all later steps.
+          echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH"
+      - name: Install dependencies and build vLLM
+        run: |
+          uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
+          uv pip install -e .
+        env:
+          # Caps the number of parallel CMake build jobs.
+          CMAKE_BUILD_PARALLEL_LEVEL: 4
+      - name: Verify installation
+        run: |
+          python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+      # Starts a tiny dummy-weight server, waits for /health to respond, then
+      # issues one completion request. Any failing command fails the step.
+      - name: Smoke test vllm serve
+        run: |
+          # Start server in background
+          vllm serve Qwen/Qwen3-0.6B \
+            --max-model-len=2K \
+            --load-format=dummy \
+            --hf-overrides '{"num_hidden_layers": 2}' \
+            --enforce-eager \
+            --port 8000 &
+          SERVER_PID=$!
+          # Cleanup: stop the server on every exit path, including when a
+          # later `curl -f` fails and the script aborts early.
+          trap 'kill "$SERVER_PID" 2>/dev/null || true' EXIT
+          # Wait for server to start (up to 30 probes, 2 seconds apart)
+          for i in {1..30}; do
+            # -f makes an HTTP error status count as "not ready yet".
+            if curl -sf http://localhost:8000/health > /dev/null; then
+              echo "Server started successfully"
+              break
+            fi
+            # Fail fast if the server process died instead of polling on.
+            if ! kill -0 "$SERVER_PID" 2>/dev/null; then
+              echo "Server process exited before becoming healthy"
+              exit 1
+            fi
+            if [ "$i" -eq 30 ]; then
+              echo "Server failed to start"
+              exit 1
+            fi
+            sleep 2
+          done
+          # Test health endpoint
+          curl -f http://localhost:8000/health
+          # Test completion
+          curl -f http://localhost:8000/v1/completions \
+            -H "Content-Type: application/json" \
+            -d '{
+              "model": "Qwen/Qwen3-0.6B",
+              "prompt": "Hello",
+              "max_tokens": 5
+            }'
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,7 +16,7 @@ jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
      with:
        python-version: "3.12"

--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,7 +13,7 @@ jobs:
      actions: write
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
+      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months

--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,9 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
+# OpenAI triton kernels copied from source
+vllm/third_party/triton_kernels/*
 # triton jit
 .triton
@@ -94,6 +97,9 @@ ipython_config.py
 # generated files
 **/generated/**
+# uv
+uv.lock
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
@@ -218,3 +224,6 @@ csrc/moe/marlin_moe_wna16/kernel_*
 # Ignore ep_kernels_workspace folder
 ep_kernels_workspace/
+# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
+!vllm/benchmarks/lib/
--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@@ -3,11 +3,9 @@ MD007:
 MD013: false
 MD024:
  siblings_only: true
+MD031:
+  list_items: false
 MD033: false
-MD042: false
-MD045: false
 MD046: false
-MD051: false
 MD052: false
-MD053: false
 MD059: false
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,30 +6,19 @@ default_stages:
  - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
- repo: https://github.com/google/yapf
-  rev: v0.43.0
-  hooks:
-  - id: yapf
-    args: [--in-place, --verbose]
-    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
-    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.11.7
+  rev: v0.14.0
  hooks:
-  - id: ruff
+  - id: ruff-check
    args: [--output-format, github, --fix]
  - id: ruff-format
-    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.35.5
+  rev: v1.38.1
  hooks:
  - id: typos
- repo: https://github.com/PyCQA/isort
+    args: [--force-exclude]
-  rev: 6.0.1
-  hooks:
-  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v20.1.3
+  rev: v21.1.2
  hooks:
  - id: clang-format
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@@ -46,32 +35,27 @@ repos:
  hooks:
  - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.6.17
+  rev: 0.9.1
  hooks:
    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
  - id: format-torch-nightly-test
    name: reformat nightly_torch_test.txt to be in sync with test.in
    language: python
-    entry: python tools/generate_nightly_torch_test.py
+    entry: python tools/pre_commit/generate_nightly_torch_test.py
    files: ^requirements/test\.(in|txt)$
  - id: mypy-local
-    name: Run mypy for local Python installation
+    name: Run mypy locally for lowest supported Python version
-    entry: python tools/pre_commit/mypy.py 0 "local"
+    entry: python tools/pre_commit/mypy.py 0 "3.10"
    stages: [pre-commit] # Don't run in CI
    <<: &mypy_common
      language: python
      types_or: [python, pyi]
      require_serial: true
      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
-  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.9
-    entry: python tools/pre_commit/mypy.py 1 "3.9"
-    <<: *mypy_common
-    stages: [manual] # Only run in CI
  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
    name: Run mypy for Python 3.10
    entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -87,14 +71,19 @@ repos:
    entry: python tools/pre_commit/mypy.py 1 "3.12"
    <<: *mypy_common
    stages: [manual] # Only run in CI
+  - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.13
+    entry: python tools/pre_commit/mypy.py 1 "3.13"
+    <<: *mypy_common
+    stages: [manual] # Only run in CI
  - id: shellcheck
    name: Lint shell scripts
-    entry: tools/shellcheck.sh
+    entry: tools/pre_commit/shellcheck.sh
    language: script
    types: [shell]
  - id: png-lint
    name: Lint PNG exports from excalidraw
-    entry: tools/png-lint.sh
+    entry: tools/pre_commit/png-lint.sh
    language: script
    types: [png]
  - id: signoff-commit
@@ -111,12 +100,12 @@ repos:
    stages: [commit-msg]
  - id: check-spdx-header
    name: Check SPDX headers
-    entry: python tools/check_spdx_header.py
+    entry: python tools/pre_commit/check_spdx_header.py
    language: python
    types: [python]
  - id: check-root-lazy-imports
    name: Check root lazy imports
-    entry: python tools/check_init_lazy_imports.py
+    entry: python tools/pre_commit/check_init_lazy_imports.py
    language: python
    types: [python]
  - id: check-filenames
@@ -130,11 +119,11 @@ repos:
    pass_filenames: false
  - id: update-dockerfile-graph
    name: Update Dockerfile dependency graph
-    entry: tools/update-dockerfile-graph.sh
+    entry: tools/pre_commit/update-dockerfile-graph.sh
    language: script
  - id: enforce-import-regex-instead-of-re
    name: Enforce import regex as re
-    entry: python tools/enforce_regex_import.py
+    entry: python tools/pre_commit/enforce_regex_import.py
    language: python
    types: [python]
    pass_filenames: false
@@ -142,7 +131,7 @@ repos:
  # forbid directly import triton
  - id: forbid-direct-triton-import
    name: "Forbid direct 'import triton'"
-    entry: python tools/check_triton_import.py
+    entry: python tools/pre_commit/check_triton_import.py
    language: python
    types: [python]
    pass_filenames: false
@@ -155,7 +144,7 @@ repos:
    additional_dependencies: [regex]
  - id: validate-config
    name: Validate configuration has default values and that each field has a docstring
-    entry: python tools/validate_config.py
+    entry: python tools/pre_commit/validate_config.py
    language: python
    additional_dependencies: [regex]
  # Keep `suggestion` last

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,11 +39,18 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx906;gfx926;gfx928;gfx936")
+# Keep the whole list inside a single quoted argument; each arch appears once.
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151;gfx926;gfx928;gfx936;gfx938")
+# ROCm installation prefix. Default to /opt/rocm but allow override via
+# -DROCM_PATH=/your/rocm/path when invoking cmake.
+if(NOT DEFINED ROCM_PATH)
+  set(ROCM_PATH "/opt/rocm" CACHE PATH "ROCm installation prefix")
+else()
+  set(ROCM_PATH ${ROCM_PATH} CACHE PATH "ROCm installation prefix" FORCE)
+endif()
 #
 # Supported/expected torch versions for CUDA/ROCm.
 #
@@ -54,8 +61,8 @@ set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
 #
 # Try to find python package with an executable that exactly matches
@@ -91,6 +98,9 @@ find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
+elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()
@@ -131,7 +141,7 @@ elseif(HIP_FOUND)
  # ROCm 5.X and 6.X
  if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
-      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
+      Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
    message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
      "expected for ROCm build, saw ${Torch_VERSION} instead.")
  endif()
@@ -180,6 +190,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
+#
+# Set compression mode for CUDA >=13.x.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA" AND
+   DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
+endif()
 #
 # Set CUDA include flags for CXX compiler.
 #
@@ -230,11 +249,28 @@ set_gencode_flags_for_srcs(
  SRCS "${VLLM_CUMEM_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")
-if(VLLM_GPU_LANG STREQUAL "CUDA")
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  message(STATUS "Enabling cumem allocator extension.")
-  # link against cuda driver library
+  if(VLLM_GPU_LANG STREQUAL "CUDA")
-  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+    # link against cuda driver library
-  define_gpu_extension_target(
+    list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+  else()
+    # link against rocm driver library. Prefer an absolute path to
+    # libamdhip64.so inside ${ROCM_PATH}/lib if available, otherwise fall
+    # back to linking by name "amdhip64".
+    find_library(AMDHIP64_LIB
+      NAMES amdhip64 libamdhip64.so
+      PATHS ${ROCM_PATH}/lib
+      NO_DEFAULT_PATH)
+    if(AMDHIP64_LIB)
+      message(STATUS "Found libamdhip64 at ${AMDHIP64_LIB}")
+      list(APPEND CUMEM_LIBS ${AMDHIP64_LIB})
+    else()
+      message(WARNING "libamdhip64 not found in ${ROCM_PATH}/lib; falling back to linking 'amdhip64' by name")
+      list(APPEND CUMEM_LIBS amdhip64)
+    endif()
+  endif()
+  define_extension_target(
    cumem_allocator
    DESTINATION vllm
    LANGUAGE CXX
@@ -262,16 +298,14 @@ set(VLLM_EXT_SRC
  "csrc/opt/activation_kernels_opt.cu"
  "csrc/attention/attention_kernels_opt.cu"
  "csrc/attention/attention_kernels_opt_tc.cu"
-  "csrc/attention/attention_with_mask_kernels.cu"
-  "csrc/attention/attention_with_mask_kernels_opt.cu"
-  "csrc/attention/attention_with_mask_kernels_opt_tc.cu"
  "csrc/opt/layernorm_kernels_opt.cu"
+  "csrc/fused_qknorm_rope_kernel.cu"
  # "csrc/layernorm_quant_kernels.cu"
  "csrc/sampler.cu"
  "csrc/cuda_view.cu"
  # "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
+  "csrc/quantization/w8a8/int8/scaled_quant.cu"
-  # "csrc/quantization/fp8/common.cu"
+  # "csrc/quantization/w8a8/fp8/common.cu"
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
  "csrc/quantization/gguf/gguf_kernel.cu"
  # "csrc/quantization/activation_kernels.cu"
@@ -283,7 +317,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.2.1")
  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -315,13 +349,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_EXT_SRC
    "csrc/quantization/awq/gemm_kernels.cu"
    "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
-    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
    "csrc/cutlass_extensions/common.cpp"
-    "csrc/quantization/fp8/per_token_group_quant.cu")
+    "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
+    "csrc/quantization/w8a8/int8/per_token_group_quant.cu")
  set_gencode_flags_for_srcs(
    SRCS "${VLLM_EXT_SRC}"
@@ -330,8 +364,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
-  # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  # marlin arches for fp16 output
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
+  cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+  # marlin arches for fp8 input
+  # - sm80 doesn't support fp8 computation
+  # - sm90 and sm100 don't support the QMMA.16832.F32.E4M3.E4M3 SASS instruction
+  # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
+  cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
  if (MARLIN_ARCHS)
    #
@@ -341,16 +384,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+    list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+    set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
-    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
-    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
-    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
+    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
-        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
+        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=$PYTHONPATH
-          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
+          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
        RESULT_VARIABLE marlin_generation_result
        OUTPUT_VARIABLE marlin_generation_result
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
@@ -363,15 +408,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
      else()
-        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
+        set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
-            CACHE STRING "Last run Marlin generate script hash" FORCE)
+            CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
        message(STATUS "Marlin generation completed successfully.")
      endif()
    else()
      message(STATUS "Marlin generation script has not changed, skipping generation.")
    endif()
-    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
+    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
@@ -379,12 +424,34 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()
    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+    file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
+      CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+      set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
+        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    endif()
+    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
+    if (MARLIN_FP8_ARCHS) 
+      file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
+        CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
+    endif()
    set(MARLIN_SRCS
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+       "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
       "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
    set_gencode_flags_for_srcs(
@@ -425,11 +492,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
    set(SRCS
-       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
+       "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -453,12 +520,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@@ -483,12 +554,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
  # require CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@@ -519,7 +594,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@@ -563,17 +638,24 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
  # CUDA 12.8 or later
-  cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
-      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
+      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
+      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
+      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${FP4_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
  else()
    message(STATUS "Not building NVFP4 as no compatible archs were found.")
@@ -582,7 +664,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
  # FP4 Archs and flags
-  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -604,7 +690,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
  # CUTLASS MLA Archs and flags
-  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
    set(SRCS
      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
@@ -630,7 +720,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # if it's possible to compile MoE kernels that use its output.
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -648,9 +738,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -669,9 +763,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
  # moe_data.cu is used by all CUTLASS MoE kernels.
-  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -688,9 +786,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -805,7 +907,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
  # Hadacore kernels
-  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
  if(HADACORE_ARCHS)
    set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
    set_gencode_flags_for_srcs(
@@ -827,7 +929,7 @@ if (VLLM_GPU_LANG STREQUAL "HIP")
 endif()
 message(STATUS "Enabling C extension.")
-define_gpu_extension_target(
+define_extension_target(
  _C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
@@ -852,10 +954,10 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/moe_lora_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu"
  "csrc/moe/moe_fused_gate.cu")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC
    "csrc/moe/moe_wna16.cu"
@@ -883,8 +985,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    CUDA_ARCHS "${CUDA_ARCHS}")
  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
-  # 9.0 for latest bf16 atomicAdd PTX
+  # moe marlin arches
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  # note that we always set `use_atomic_add=False` for moe marlin now,
+  # so we don't need 9.0 for bf16 atomicAdd PTX
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+  # moe marlin arches for fp8 input
+  # - sm80 doesn't support fp8 computation
+  # - sm90 and sm100 don't support the QMMA.16832.F32.E4M3.E4M3 SASS instruction
+  # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
+  cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)
    #
@@ -894,16 +1003,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(MOE_MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+    list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+    set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
-    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
-    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
-    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
+    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
-        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
+        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=$PYTHONPATH
-          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
+          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
        RESULT_VARIABLE moe_marlin_generation_result
        OUTPUT_VARIABLE moe_marlin_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
@@ -916,7 +1027,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
      else()
-        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
+        set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
            CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
        message(STATUS "Marlin MOE generation completed successfully.")
      endif()
@@ -924,16 +1035,28 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
    endif()
-    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
+    file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
+    list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
    set_gencode_flags_for_srcs(
-      SRCS "${MOE_WNAA16_MARLIN_SRC}"
+      SRCS "${MARLIN_MOE_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-      set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
+      set_source_files_properties(${MARLIN_MOE_SRC}
        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
    endif()
+    list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
-    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
+    if (MARLIN_MOE_FP8_ARCHS)
+      file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
+      set_gencode_flags_for_srcs(
+        SRCS "${MARLIN_MOE_FP8_SRC}"
+        CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
+      if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+        set_source_files_properties(${MARLIN_MOE_FP8_SRC}
+          PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+      endif()
+      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
+    endif()
    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
  else()
@@ -943,7 +1066,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 message(STATUS "Enabling moe extension.")
-define_gpu_extension_target(
+define_extension_target(
  _moe_C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
@@ -965,7 +1088,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
    "csrc/rocm/skinny_gemms.cu"
    "csrc/rocm/attention.cu")
-  define_gpu_extension_target(
+  define_extension_target(
    _rocm_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
@@ -977,9 +1100,15 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 endif()
 ]]
+# For CUDA builds also build the triton_kernels external package.
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+    include(cmake/external_projects/triton_kernels.cmake)
+endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
    include(cmake/external_projects/flashmla.cmake)
+    include(cmake/external_projects/qutlass.cmake)
    # vllm-flash-attn should be last as it overwrites some CMake functions
    include(cmake/external_projects/vllm_flash_attn.cmake)

--- a/README.md
+++ b/README.md
@@ -97,7 +97,7 @@ python3 setup.py install （若调试，可使用python3 setup.py develop）
 + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.11.0;
+- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.12.0;
 ## Known Issue
 - 无

--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
@@ -21,6 +21,11 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 *Latest News* 🔥
+- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
+- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI).
+- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
+- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
+- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
@@ -81,7 +86,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support
@@ -148,6 +153,7 @@ Compute Resources:
 - Trainy
 - UC Berkeley
 - UC San Diego
+- Volcengine
 Slack Sponsor: Anyscale